mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools to use a unified subcommand structure with obik, adds support for per-set metadata and ID management, enhances the k-mer set group builder to support appending to existing groups, and improves command-line option handling with a new global options registration system. Key changes: - Introduce obik command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask) - Add support for per-set metadata and ID management in kmer set groups - Implement ability to append to existing kmer index groups - Refactor option parsing to use a global options registration system - Add new commands for listing, copying, moving, and removing sets - Enhance low-complexity masking with new options and output formats - Improve kmer index summary with Jaccard distance matrix support - Remove deprecated obikindex and obisuperkmer commands - Update build process to use the new subcommand structure
This commit is contained in:
34
cmd/obitools/obik/main.go
Normal file
34
cmd/obitools/obik/main.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package main

import (
	"context"
	"errors"
	"os"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obik"
	"github.com/DavidGamba/go-getoptions"
)

// obik: umbrella command managing disk-based k-mer indices through
// subcommands registered by obik.OptionSet.
func main() {
	// Report the BioSequence allocation status when the program exits.
	defer obiseq.LogBioSeqStatus()

	root, parse := obioptions.GenerateSubcommandParser(
		"obik",
		"Manage disk-based kmer indices",
		obik.OptionSet,
	)

	// Parse global options; the leftover arguments select the subcommand.
	_, rest := parse(os.Args)

	// Dispatch routes to the subcommand's registered handler.
	if err := root.Dispatch(context.Background(), rest); err != nil {
		// Help was explicitly requested: not an error condition.
		if errors.Is(err, getoptions.ErrorHelpCalled) {
			os.Exit(0)
		}
		log.Fatalf("Error: %v", err)
	}
}
|
||||
@@ -1,23 +0,0 @@
|
||||
package main

import (
	"os"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obikindex"
)

// obikindex: command-line entry point building a disk-based k-mer index
// from the sequence files given as positional arguments.
func main() {
	parse := obioptions.GenerateOptionParser(
		"obikindex",
		"builds a disk-based kmer index from sequence files",
		obikindex.OptionSet)

	// Positional arguments are the input sequence files.
	_, positional := parse(os.Args)

	// Open the input sequence stream; prints a message and aborts on failure.
	seqs, err := obiconvert.CLIReadBioSequences(positional...)
	obiconvert.OpenSequenceDataErrorMessage(positional, err)

	obikindex.CLIBuildKmerIndex(seqs)
}
|
||||
@@ -1,47 +0,0 @@
|
||||
package main

import (
	"os"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilowmask"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

// obilowmask: command-line entry point masking low-complexity regions of
// sequences via obilowmask.CLISequenceEntropyMasker.
func main() {

	// Report the BioSequence allocation status when the program exits.
	defer obiseq.LogBioSeqStatus()

	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	// go tool trace cpu.trace
	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	// FIX: the parser previously identified itself as "obimicrosat" with a
	// microsatellite-search description — copied from another tool and
	// inconsistent with this command (entropy-based low-complexity masking).
	optionParser := obioptions.GenerateOptionParser(
		"obilowmask",
		"masks low-complexity regions of sequences using an entropy criterion",
		obilowmask.OptionSet)

	_, args := optionParser(os.Args)

	// Open the input sequence stream; prints a message and aborts on failure.
	sequences, err := obiconvert.CLIReadBioSequences(args...)
	obiconvert.OpenSequenceDataErrorMessage(args, err)

	// Mask low-complexity regions, write the result, then wait for the
	// processing pipeline to drain before exiting.
	selected := obilowmask.CLISequenceEntropyMasker(sequences)
	obiconvert.CLIWriteBioSequences(selected, true)
	obiutils.WaitForLastPipe()

}
|
||||
@@ -1,34 +0,0 @@
|
||||
package main

import (
	"os"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisuperkmer"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

// obisuperkmer: command-line entry point extracting super k-mers from
// the sequence files given as positional arguments.
func main() {
	parse := obioptions.GenerateOptionParser(
		"obisuperkmer",
		"extract super k-mers from sequence files",
		obisuperkmer.OptionSet)

	// Positional arguments are the input sequence files.
	_, positional := parse(os.Args)

	// Open the input sequence stream; prints a message and aborts on failure.
	input, err := obiconvert.CLIReadBioSequences(positional...)
	obiconvert.OpenSequenceDataErrorMessage(positional, err)

	// Extract super k-mers, write them out, and wait for the processing
	// pipeline to drain before exiting.
	superKmers := obisuperkmer.CLIExtractSuperKmers(input)
	obiconvert.CLIWriteBioSequences(superKmers, true)
	obiutils.WaitForLastPipe()
}
|
||||
@@ -35,10 +35,12 @@ type KmerSetGroupBuilder struct {
|
||||
dir string
|
||||
k int
|
||||
m int
|
||||
n int // number of sets
|
||||
n int // number of NEW sets being built
|
||||
P int // number of partitions
|
||||
startIndex int // first set index (0 for new groups, existingN for appends)
|
||||
config builderConfig
|
||||
writers [][]*SkmWriter // [setIndex][partIndex]
|
||||
existing *KmerSetGroup // non-nil when appending to existing group
|
||||
writers [][]*SkmWriter // [setIndex][partIndex] (local index 0..n-1)
|
||||
mu [][]sync.Mutex // per-writer mutex for concurrent access
|
||||
closed bool
|
||||
}
|
||||
@@ -132,12 +134,89 @@ func NewKmerSetGroupBuilder(directory string, k, m, n, P int,
|
||||
m: m,
|
||||
n: n,
|
||||
P: P,
|
||||
startIndex: 0,
|
||||
config: config,
|
||||
writers: writers,
|
||||
mu: mutexes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// AppendKmerSetGroupBuilder opens an existing KmerSetGroup and creates
// a builder that adds n new sets starting from the existing set count.
// The k, m, and partitions are inherited from the existing group.
//
// The new sets are staged under directory/.build using LOCAL indices
// 0..n-1; their global indices (startIndex + local) are only used when
// the builder is closed and the sets are materialized in the group.
func AppendKmerSetGroupBuilder(directory string, n int, options ...BuilderOption) (*KmerSetGroupBuilder, error) {
	existing, err := OpenKmerSetGroup(directory)
	if err != nil {
		return nil, fmt.Errorf("obikmer: open existing group: %w", err)
	}

	if n < 1 {
		return nil, fmt.Errorf("obikmer: n must be >= 1, got %d", n)
	}

	// Geometry is inherited from the group being appended to.
	k := existing.K()
	m := existing.M()
	P := existing.Partitions()
	// New sets are numbered after the existing ones.
	startIndex := existing.Size()

	// Apply functional options onto the builder configuration.
	var config builderConfig
	for _, opt := range options {
		opt(&config)
	}

	// Create build directory structure for new sets (local indices 0..n-1).
	buildDir := filepath.Join(directory, ".build")
	for s := 0; s < n; s++ {
		setDir := filepath.Join(buildDir, fmt.Sprintf("set_%d", s))
		if err := os.MkdirAll(setDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create build dir: %w", err)
		}
	}

	// Create SKM writers for new sets: one writer (and one mutex, for
	// concurrent AddSequence calls) per (set, partition) pair.
	writers := make([][]*SkmWriter, n)
	mutexes := make([][]sync.Mutex, n)
	for s := 0; s < n; s++ {
		writers[s] = make([]*SkmWriter, P)
		mutexes[s] = make([]sync.Mutex, P)
		for p := 0; p < P; p++ {
			path := filepath.Join(buildDir, fmt.Sprintf("set_%d", s),
				fmt.Sprintf("part_%04d.skm", p))
			w, err := NewSkmWriter(path)
			if err != nil {
				// On failure, close every writer opened so far (the nil
				// check skips slots of the current row not yet filled).
				// NOTE(review): staged directories/files are not removed
				// here — confirm leftover .build content is acceptable.
				for ss := 0; ss <= s; ss++ {
					for pp := 0; pp < P; pp++ {
						if writers[ss][pp] != nil {
							writers[ss][pp].Close()
						}
					}
				}
				return nil, fmt.Errorf("obikmer: create skm writer: %w", err)
			}
			writers[s][p] = w
		}
	}

	// `existing` is kept on the builder so Close() can extend the group
	// in place instead of creating a fresh one.
	return &KmerSetGroupBuilder{
		dir:        directory,
		k:          k,
		m:          m,
		n:          n,
		P:          P,
		startIndex: startIndex,
		config:     config,
		existing:   existing,
		writers:    writers,
		mu:         mutexes,
	}, nil
}
|
||||
|
||||
// StartIndex returns the first global set index for the new sets being built.
// For new groups this is 0; for appends it is the existing group's Size().
// Callers translate a builder-local set index (0..n-1) to its global index
// by adding this value.
func (b *KmerSetGroupBuilder) StartIndex() int {
	return b.startIndex
}
|
||||
|
||||
// AddSequence extracts super-kmers from a sequence and writes them
|
||||
// to the appropriate partition files for the given set.
|
||||
func (b *KmerSetGroupBuilder) AddSequence(setIndex int, seq *obiseq.BioSequence) {
|
||||
@@ -193,9 +272,10 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Create output directory structure
|
||||
// 2. Create output directory structure for new sets
|
||||
for s := 0; s < b.n; s++ {
|
||||
setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", s))
|
||||
globalIdx := b.startIndex + s
|
||||
setDir := filepath.Join(b.dir, fmt.Sprintf("set_%d", globalIdx))
|
||||
if err := os.MkdirAll(setDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create set dir: %w", err)
|
||||
}
|
||||
@@ -251,25 +331,45 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
||||
}
|
||||
|
||||
// 3. Build KmerSetGroup and write metadata
|
||||
totalCounts := make([]uint64, b.n)
|
||||
newCounts := make([]uint64, b.n)
|
||||
for s := 0; s < b.n; s++ {
|
||||
for p := 0; p < b.P; p++ {
|
||||
totalCounts[s] += counts[s][p]
|
||||
newCounts[s] += counts[s][p]
|
||||
}
|
||||
}
|
||||
|
||||
var ksg *KmerSetGroup
|
||||
|
||||
if b.existing != nil {
|
||||
// Append mode: extend existing group
|
||||
ksg = b.existing
|
||||
ksg.n += b.n
|
||||
ksg.setsIDs = append(ksg.setsIDs, make([]string, b.n)...)
|
||||
ksg.counts = append(ksg.counts, newCounts...)
|
||||
newMeta := make([]map[string]interface{}, b.n)
|
||||
for i := range newMeta {
|
||||
newMeta[i] = make(map[string]interface{})
|
||||
}
|
||||
ksg.setsMetadata = append(ksg.setsMetadata, newMeta...)
|
||||
} else {
|
||||
// New group
|
||||
setsIDs := make([]string, b.n)
|
||||
|
||||
ksg := &KmerSetGroup{
|
||||
setsMetadata := make([]map[string]interface{}, b.n)
|
||||
for i := range setsMetadata {
|
||||
setsMetadata[i] = make(map[string]interface{})
|
||||
}
|
||||
ksg = &KmerSetGroup{
|
||||
path: b.dir,
|
||||
k: b.k,
|
||||
m: b.m,
|
||||
partitions: b.P,
|
||||
n: b.n,
|
||||
setsIDs: setsIDs,
|
||||
counts: totalCounts,
|
||||
counts: newCounts,
|
||||
setsMetadata: setsMetadata,
|
||||
Metadata: make(map[string]interface{}),
|
||||
}
|
||||
}
|
||||
|
||||
if err := ksg.saveMetadata(); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: write metadata: %w", err)
|
||||
@@ -285,12 +385,14 @@ func (b *KmerSetGroupBuilder) Close() (*KmerSetGroup, error) {
|
||||
// finalizePartition processes a single partition: load SKM, extract k-mers,
|
||||
// sort, dedup/count, write KDI.
|
||||
func (b *KmerSetGroupBuilder) finalizePartition(setIdx, partIdx int, count *uint64) error {
|
||||
// setIdx is local (0..n-1); build dirs use local index, output dirs use global
|
||||
skmPath := filepath.Join(b.dir, ".build",
|
||||
fmt.Sprintf("set_%d", setIdx),
|
||||
fmt.Sprintf("part_%04d.skm", partIdx))
|
||||
|
||||
globalIdx := b.startIndex + setIdx
|
||||
kdiPath := filepath.Join(b.dir,
|
||||
fmt.Sprintf("set_%d", setIdx),
|
||||
fmt.Sprintf("set_%d", globalIdx),
|
||||
fmt.Sprintf("part_%04d.kdi", partIdx))
|
||||
|
||||
// Load super-kmers and extract canonical k-mers
|
||||
|
||||
@@ -2,9 +2,12 @@ package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"iter"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidist"
|
||||
@@ -50,6 +53,7 @@ type KmerSetGroup struct {
|
||||
n int // number of sets N
|
||||
setsIDs []string // IDs of individual sets
|
||||
counts []uint64 // total k-mer count per set (sum over partitions)
|
||||
setsMetadata []map[string]interface{} // per-set user metadata
|
||||
Metadata map[string]interface{} // group-level user metadata
|
||||
}
|
||||
|
||||
@@ -63,6 +67,7 @@ type diskMetadata struct {
|
||||
Size int `toml:"size"`
|
||||
SetsIDs []string `toml:"sets_ids,omitempty"`
|
||||
Counts []uint64 `toml:"counts,omitempty"`
|
||||
SetsMetadata []map[string]interface{} `toml:"sets_metadata,omitempty"`
|
||||
UserMetadata map[string]interface{} `toml:"user_metadata,omitempty"`
|
||||
}
|
||||
|
||||
@@ -89,6 +94,7 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
||||
n: meta.Size,
|
||||
setsIDs: meta.SetsIDs,
|
||||
counts: meta.Counts,
|
||||
setsMetadata: meta.SetsMetadata,
|
||||
Metadata: meta.UserMetadata,
|
||||
}
|
||||
if ksg.Metadata == nil {
|
||||
@@ -97,6 +103,12 @@ func OpenKmerSetGroup(directory string) (*KmerSetGroup, error) {
|
||||
if ksg.setsIDs == nil {
|
||||
ksg.setsIDs = make([]string, ksg.n)
|
||||
}
|
||||
if ksg.setsMetadata == nil {
|
||||
ksg.setsMetadata = make([]map[string]interface{}, ksg.n)
|
||||
for i := range ksg.setsMetadata {
|
||||
ksg.setsMetadata[i] = make(map[string]interface{})
|
||||
}
|
||||
}
|
||||
if ksg.counts == nil {
|
||||
// Compute counts by scanning partitions
|
||||
ksg.counts = make([]uint64, ksg.n)
|
||||
@@ -133,6 +145,7 @@ func (ksg *KmerSetGroup) saveMetadata() error {
|
||||
Size: ksg.n,
|
||||
SetsIDs: ksg.setsIDs,
|
||||
Counts: ksg.counts,
|
||||
SetsMetadata: ksg.setsMetadata,
|
||||
UserMetadata: ksg.Metadata,
|
||||
}
|
||||
|
||||
@@ -578,3 +591,299 @@ func (ksg *KmerSetGroup) JaccardSimilarityMatrix() *obidist.DistMatrix {
|
||||
|
||||
return sm
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Set ID accessors
|
||||
// ==============================
|
||||
|
||||
// SetsIDs returns a copy of the per-set string identifiers.
|
||||
func (ksg *KmerSetGroup) SetsIDs() []string {
|
||||
out := make([]string, len(ksg.setsIDs))
|
||||
copy(out, ksg.setsIDs)
|
||||
return out
|
||||
}
|
||||
|
||||
// SetIDOf returns the string ID of the set at the given index.
|
||||
// Returns "" if index is out of range.
|
||||
func (ksg *KmerSetGroup) SetIDOf(index int) string {
|
||||
if index < 0 || index >= ksg.n {
|
||||
return ""
|
||||
}
|
||||
return ksg.setsIDs[index]
|
||||
}
|
||||
|
||||
// SetSetID sets the string ID of the set at the given index.
|
||||
func (ksg *KmerSetGroup) SetSetID(index int, id string) {
|
||||
if index >= 0 && index < ksg.n {
|
||||
ksg.setsIDs[index] = id
|
||||
}
|
||||
}
|
||||
|
||||
// IndexOfSetID returns the numeric index for a set ID, or -1 if not found.
|
||||
func (ksg *KmerSetGroup) IndexOfSetID(id string) int {
|
||||
for i, sid := range ksg.setsIDs {
|
||||
if sid == id {
|
||||
return i
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// MatchSetIDs resolves glob patterns against set IDs and returns matching
|
||||
// indices sorted in ascending order. Uses path.Match for pattern matching
|
||||
// (supports *, ?, [...] patterns). Returns error if a pattern is malformed.
|
||||
func (ksg *KmerSetGroup) MatchSetIDs(patterns []string) ([]int, error) {
|
||||
seen := make(map[int]bool)
|
||||
for _, pattern := range patterns {
|
||||
for i, sid := range ksg.setsIDs {
|
||||
matched, err := path.Match(pattern, sid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("obikmer: invalid glob pattern %q: %w", pattern, err)
|
||||
}
|
||||
if matched {
|
||||
seen[i] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
result := make([]int, 0, len(seen))
|
||||
for idx := range seen {
|
||||
result = append(result, idx)
|
||||
}
|
||||
sort.Ints(result)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Per-set metadata accessors
|
||||
// ==============================
|
||||
|
||||
// GetSetMetadata returns the value of a per-set metadata key.
|
||||
func (ksg *KmerSetGroup) GetSetMetadata(setIndex int, key string) (interface{}, bool) {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return nil, false
|
||||
}
|
||||
v, ok := ksg.setsMetadata[setIndex][key]
|
||||
return v, ok
|
||||
}
|
||||
|
||||
// SetSetMetadata sets a per-set metadata attribute.
|
||||
func (ksg *KmerSetGroup) SetSetMetadata(setIndex int, key string, value interface{}) {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return
|
||||
}
|
||||
if ksg.setsMetadata[setIndex] == nil {
|
||||
ksg.setsMetadata[setIndex] = make(map[string]interface{})
|
||||
}
|
||||
ksg.setsMetadata[setIndex][key] = value
|
||||
}
|
||||
|
||||
// DeleteSetMetadata removes a per-set metadata attribute.
|
||||
func (ksg *KmerSetGroup) DeleteSetMetadata(setIndex int, key string) {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return
|
||||
}
|
||||
delete(ksg.setsMetadata[setIndex], key)
|
||||
}
|
||||
|
||||
// AllSetMetadata returns a copy of all metadata for a given set.
|
||||
func (ksg *KmerSetGroup) AllSetMetadata(setIndex int) map[string]interface{} {
|
||||
if setIndex < 0 || setIndex >= ksg.n {
|
||||
return nil
|
||||
}
|
||||
out := make(map[string]interface{}, len(ksg.setsMetadata[setIndex]))
|
||||
for k, v := range ksg.setsMetadata[setIndex] {
|
||||
out[k] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Exported partition path and compatibility
|
||||
// ==============================
|
||||
|
||||
// PartitionPath returns the file path for partition partIndex of set setIndex.
// It is a thin exported wrapper around the unexported partitionPath helper so
// external packages can locate partition files.
// NOTE(review): indices are not range-checked here; behavior for out-of-range
// values follows partitionPath — confirm callers validate first.
func (ksg *KmerSetGroup) PartitionPath(setIndex, partIndex int) string {
	return ksg.partitionPath(setIndex, partIndex)
}
|
||||
|
||||
// IsCompatibleWith returns true if the other group has the same k, m, and partitions.
|
||||
func (ksg *KmerSetGroup) IsCompatibleWith(other *KmerSetGroup) bool {
|
||||
return ksg.k == other.k && ksg.m == other.m && ksg.partitions == other.partitions
|
||||
}
|
||||
|
||||
// ==============================
|
||||
// Set management operations
|
||||
// ==============================
|
||||
|
||||
// NewEmptyCompatible creates an empty KmerSetGroup at destDir with the same
|
||||
// k, m, and partitions as this group. The destination must not already exist.
|
||||
func (ksg *KmerSetGroup) NewEmptyCompatible(destDir string) (*KmerSetGroup, error) {
|
||||
if err := os.MkdirAll(destDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: create directory: %w", err)
|
||||
}
|
||||
|
||||
dest := &KmerSetGroup{
|
||||
path: destDir,
|
||||
k: ksg.k,
|
||||
m: ksg.m,
|
||||
partitions: ksg.partitions,
|
||||
n: 0,
|
||||
setsIDs: []string{},
|
||||
counts: []uint64{},
|
||||
setsMetadata: []map[string]interface{}{},
|
||||
Metadata: make(map[string]interface{}),
|
||||
}
|
||||
|
||||
if err := dest.saveMetadata(); err != nil {
|
||||
return nil, fmt.Errorf("obikmer: write metadata: %w", err)
|
||||
}
|
||||
|
||||
return dest, nil
|
||||
}
|
||||
|
||||
// RemoveSetByID removes the set with the given ID from the group.
// It deletes the set directory, renumbers all subsequent sets, and
// updates the metadata on disk.
//
// NOTE(review): the directory renames are performed one by one and are not
// atomic as a whole — a failure mid-loop leaves the on-disk layout out of
// sync with the in-memory/persisted metadata. Confirm this is acceptable
// for the intended usage (single-writer, offline maintenance).
func (ksg *KmerSetGroup) RemoveSetByID(id string) error {
	idx := ksg.IndexOfSetID(id)
	if idx < 0 {
		return fmt.Errorf("obikmer: set ID %q not found", id)
	}

	// Delete the set directory
	setDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", idx))
	if err := os.RemoveAll(setDir); err != nil {
		return fmt.Errorf("obikmer: remove set directory: %w", err)
	}

	// Renumber subsequent sets: shift every directory after idx down by one
	// so the set_<i> naming stays dense.
	for i := idx + 1; i < ksg.n; i++ {
		oldDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i))
		newDir := filepath.Join(ksg.path, fmt.Sprintf("set_%d", i-1))
		if err := os.Rename(oldDir, newDir); err != nil {
			return fmt.Errorf("obikmer: rename set_%d to set_%d: %w", i, i-1, err)
		}
	}

	// Update the parallel slices (IDs, counts, per-set metadata) and the
	// set count to mirror the on-disk renumbering.
	ksg.setsIDs = append(ksg.setsIDs[:idx], ksg.setsIDs[idx+1:]...)
	ksg.counts = append(ksg.counts[:idx], ksg.counts[idx+1:]...)
	ksg.setsMetadata = append(ksg.setsMetadata[:idx], ksg.setsMetadata[idx+1:]...)
	ksg.n--

	// Persist the updated metadata.
	return ksg.saveMetadata()
}
|
||||
|
||||
// CopySetsByIDTo copies sets identified by their IDs into a KmerSetGroup
// at destDir. If destDir does not exist, a new compatible empty group is
// created. If it exists, compatibility (k, m, partitions) is checked.
// If a set ID already exists in the destination, an error is returned
// unless force is true (in which case the existing set is replaced).
// Per-set metadata travels with the set.
//
// NOTE(review): if the same ID appears twice in ids, the second copy will
// conflict with the first in the destination — confirm callers deduplicate.
func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool) (*KmerSetGroup, error) {
	// Resolve source IDs to indices up front so a bad ID fails before any
	// data is written to the destination.
	srcIndices := make([]int, len(ids))
	for i, id := range ids {
		idx := ksg.IndexOfSetID(id)
		if idx < 0 {
			return nil, fmt.Errorf("obikmer: source set ID %q not found", id)
		}
		srcIndices[i] = idx
	}

	// Open or create destination. The presence of metadata.toml is what
	// distinguishes an existing group from a fresh directory.
	var dest *KmerSetGroup
	metaPath := filepath.Join(destDir, "metadata.toml")
	if _, err := os.Stat(metaPath); err == nil {
		// Destination exists: it must share k, m, and partition count.
		dest, err = OpenKmerSetGroup(destDir)
		if err != nil {
			return nil, fmt.Errorf("obikmer: open destination: %w", err)
		}
		if !ksg.IsCompatibleWith(dest) {
			return nil, fmt.Errorf("obikmer: incompatible groups: source (k=%d, m=%d, P=%d) vs dest (k=%d, m=%d, P=%d)",
				ksg.k, ksg.m, ksg.partitions, dest.k, dest.m, dest.partitions)
		}
	} else {
		// Create new destination with the same geometry as the source.
		var err error
		dest, err = ksg.NewEmptyCompatible(destDir)
		if err != nil {
			return nil, err
		}
	}

	// Copy each set
	for i, srcIdx := range srcIndices {
		srcID := ids[i]

		// Check for ID conflict in destination
		existingIdx := dest.IndexOfSetID(srcID)
		if existingIdx >= 0 {
			if !force {
				return nil, fmt.Errorf("obikmer: set ID %q already exists in destination (use force to replace)", srcID)
			}
			// Force: remove existing set in destination. This renumbers the
			// destination's sets, so destIdx is computed afterwards.
			if err := dest.RemoveSetByID(srcID); err != nil {
				return nil, fmt.Errorf("obikmer: remove existing set %q in destination: %w", srcID, err)
			}
		}

		// Destination set index = current dest size
		destIdx := dest.n

		// Create destination set directory
		destSetDir := filepath.Join(destDir, fmt.Sprintf("set_%d", destIdx))
		if err := os.MkdirAll(destSetDir, 0755); err != nil {
			return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
		}

		// Copy all partition files
		for p := 0; p < ksg.partitions; p++ {
			srcPath := ksg.partitionPath(srcIdx, p)
			destPath := dest.partitionPath(destIdx, p)
			if err := copyFile(srcPath, destPath); err != nil {
				return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
			}
		}

		// Update destination metadata (ID, count, then per-set metadata).
		dest.setsIDs = append(dest.setsIDs, srcID)
		dest.counts = append(dest.counts, ksg.counts[srcIdx])

		// Copy per-set metadata; AllSetMetadata already returns a copy, so
		// source and destination do not share the map.
		srcMeta := ksg.AllSetMetadata(srcIdx)
		if srcMeta == nil {
			srcMeta = make(map[string]interface{})
		}
		dest.setsMetadata = append(dest.setsMetadata, srcMeta)
		dest.n++
	}

	// Persist the destination metadata once, after all sets are copied.
	if err := dest.saveMetadata(); err != nil {
		return nil, fmt.Errorf("obikmer: save destination metadata: %w", err)
	}

	return dest, nil
}
|
||||
|
||||
// copyFile copies a file from src to dst.
|
||||
func copyFile(src, dst string) error {
|
||||
in, err := os.Open(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer in.Close()
|
||||
|
||||
out, err := os.Create(dst)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
if _, err := io.Copy(out, in); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return out.Close()
|
||||
}
|
||||
|
||||
@@ -26,16 +26,11 @@ var __defaut_taxonomy_mutex__ sync.Mutex
|
||||
|
||||
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
||||
|
||||
func GenerateOptionParser(program string,
|
||||
documentation string,
|
||||
optionset ...func(*getoptions.GetOpt)) ArgumentParser {
|
||||
|
||||
options := getoptions.New()
|
||||
options.Self(program, documentation)
|
||||
options.SetMode(getoptions.Bundling)
|
||||
options.SetUnknownMode(getoptions.Fail)
|
||||
options.Bool("help", false, options.Alias("h", "?"))
|
||||
|
||||
// RegisterGlobalOptions registers the global options shared by all obitools
|
||||
// commands onto the given GetOpt instance. It does NOT register --help,
|
||||
// which must be handled by the caller (either as a Bool option or via
|
||||
// HelpCommand for subcommand-based parsers).
|
||||
func RegisterGlobalOptions(options *getoptions.GetOpt) {
|
||||
options.Bool("version", false,
|
||||
options.Description("Prints the version and exits."))
|
||||
|
||||
@@ -46,17 +41,10 @@ func GenerateOptionParser(program string,
|
||||
options.BoolVar(&_Pprof, "pprof", false,
|
||||
options.Description("Enable pprof server. Look at the log for details."))
|
||||
|
||||
// options.IntVar(&_ParallelWorkers, "workers", _ParallelWorkers,
|
||||
// options.Alias("w"),
|
||||
// options.Description("Number of parallele threads computing the result"))
|
||||
|
||||
options.IntVar(obidefault.MaxCPUPtr(), "max-cpu", obidefault.MaxCPU(),
|
||||
options.GetEnv("OBIMAXCPU"),
|
||||
options.Description("Number of parallele threads computing the result"))
|
||||
|
||||
// options.BoolVar(&_Pprof, "force-one-cpu", false,
|
||||
// options.Description("Force to use only one cpu core for parallel processing"))
|
||||
|
||||
options.IntVar(&_PprofMudex, "pprof-mutex", _PprofMudex,
|
||||
options.GetEnv("OBIPPROFMUTEX"),
|
||||
options.Description("Enable profiling of mutex lock."))
|
||||
@@ -77,19 +65,22 @@ func GenerateOptionParser(program string,
|
||||
options.GetEnv("OBIWARNING"),
|
||||
options.Description("Stop printing of the warning message"),
|
||||
)
|
||||
|
||||
for _, o := range optionset {
|
||||
o(options)
|
||||
}
|
||||
|
||||
return func(args []string) (*getoptions.GetOpt, []string) {
|
||||
|
||||
remaining, err := options.Parse(args[1:])
|
||||
|
||||
// ProcessParsedOptions handles the post-parse logic common to all obitools
|
||||
// commands: help, version, debug, pprof, taxonomy, cpu configuration, etc.
|
||||
// It receives the GetOpt instance and the parse error (if any).
|
||||
func ProcessParsedOptions(options *getoptions.GetOpt, parseErr error) {
|
||||
// Note: "help" may not be registered as a Bool (e.g. when using HelpCommand
|
||||
// for subcommand-based parsers). Only check if it won't panic.
|
||||
// We use a recover guard to be safe.
|
||||
func() {
|
||||
defer func() { recover() }()
|
||||
if options.Called("help") {
|
||||
fmt.Fprint(os.Stderr, options.Help())
|
||||
os.Exit(0)
|
||||
}
|
||||
}()
|
||||
|
||||
if options.Called("version") {
|
||||
fmt.Fprintf(os.Stderr, "OBITools %s\n", VersionString())
|
||||
@@ -107,7 +98,6 @@ func GenerateOptionParser(program string,
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot load default taxonomy: %v", err)
|
||||
|
||||
}
|
||||
|
||||
taxonomy.SetAsDefault()
|
||||
@@ -146,32 +136,14 @@ func GenerateOptionParser(program string,
|
||||
}
|
||||
|
||||
// Handle user errors
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", err)
|
||||
if parseErr != nil {
|
||||
fmt.Fprintf(os.Stderr, "ERROR: %s\n\n", parseErr)
|
||||
fmt.Fprint(os.Stderr, options.Help(getoptions.HelpSynopsis))
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// // Setup the maximum number of CPU usable by the program
|
||||
// if obidefault.MaxCPU() == 1 {
|
||||
// log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
|
||||
// log.Warn("The number of CPU requested has been set to 2")
|
||||
// obidefault.SetMaxCPU(2)
|
||||
// }
|
||||
|
||||
// if options.Called("force-one-cpu") {
|
||||
// log.Warn("Limitating the Maximum number of CPU to 1 is not recommanded")
|
||||
// log.Warn("The number of CPU has been forced to 1")
|
||||
// log.Warn("This can lead to unexpected behavior")
|
||||
// obidefault.SetMaxCPU(1)
|
||||
// }
|
||||
|
||||
runtime.GOMAXPROCS(obidefault.MaxCPU())
|
||||
|
||||
// if options.Called("max-cpu") || options.Called("force-one-cpu") {
|
||||
// log.Printf("CPU number limited to %d", obidefault.MaxCPU())
|
||||
// }
|
||||
|
||||
if options.Called("max-cpu") {
|
||||
log.Printf("CPU number limited to %d", obidefault.MaxCPU())
|
||||
}
|
||||
@@ -182,14 +154,30 @@ func GenerateOptionParser(program string,
|
||||
|
||||
log.Printf("Number of workers set %d", obidefault.ParallelWorkers())
|
||||
|
||||
// if options.Called("workers") {
|
||||
|
||||
// }
|
||||
|
||||
if options.Called("solexa") {
|
||||
obidefault.SetReadQualitiesShift(64)
|
||||
}
|
||||
}
|
||||
|
||||
func GenerateOptionParser(program string,
|
||||
documentation string,
|
||||
optionset ...func(*getoptions.GetOpt)) ArgumentParser {
|
||||
|
||||
options := getoptions.New()
|
||||
options.Self(program, documentation)
|
||||
options.SetMode(getoptions.Bundling)
|
||||
options.SetUnknownMode(getoptions.Fail)
|
||||
options.Bool("help", false, options.Alias("h", "?"))
|
||||
|
||||
RegisterGlobalOptions(options)
|
||||
|
||||
for _, o := range optionset {
|
||||
o(options)
|
||||
}
|
||||
|
||||
return func(args []string) (*getoptions.GetOpt, []string) {
|
||||
remaining, err := options.Parse(args[1:])
|
||||
ProcessParsedOptions(options, err)
|
||||
return options, remaining
|
||||
}
|
||||
}
|
||||
|
||||
43
pkg/obioptions/subcommand.go
Normal file
43
pkg/obioptions/subcommand.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package obioptions
|
||||
|
||||
import (
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// GenerateSubcommandParser creates an option parser that supports subcommands
|
||||
// via go-getoptions' NewCommand/SetCommandFn/Dispatch API.
|
||||
//
|
||||
// The setup function receives the root *GetOpt and should register subcommands
|
||||
// using opt.NewCommand(). Global options (--debug, --max-cpu, etc.) are
|
||||
// registered before setup is called and are inherited by all subcommands.
|
||||
//
|
||||
// Returns the root *GetOpt (needed for Dispatch) and an ArgumentParser
|
||||
// that handles parsing and post-parse processing.
|
||||
func GenerateSubcommandParser(
|
||||
program string,
|
||||
documentation string,
|
||||
setup func(opt *getoptions.GetOpt),
|
||||
) (*getoptions.GetOpt, ArgumentParser) {
|
||||
|
||||
options := getoptions.New()
|
||||
options.Self(program, documentation)
|
||||
options.SetMode(getoptions.Bundling)
|
||||
options.SetUnknownMode(getoptions.Fail)
|
||||
|
||||
// Register global options (inherited by all subcommands)
|
||||
RegisterGlobalOptions(options)
|
||||
|
||||
// Let the caller register subcommands
|
||||
setup(options)
|
||||
|
||||
// Add automatic help subcommand (must be after all commands)
|
||||
options.HelpCommand("help", options.Description("Show help for a command"))
|
||||
|
||||
parser := func(args []string) (*getoptions.GetOpt, []string) {
|
||||
remaining, err := options.Parse(args[1:])
|
||||
ProcessParsedOptions(options, err)
|
||||
return options, remaining
|
||||
}
|
||||
|
||||
return options, parser
|
||||
}
|
||||
55
pkg/obitools/obik/cp.go
Normal file
55
pkg/obitools/obik/cp.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] <source_index> <dest_index>")
|
||||
}
|
||||
|
||||
srcDir := args[0]
|
||||
destDir := args[1]
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open source kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Resolve set patterns
|
||||
patterns := CLISetPatterns()
|
||||
var ids []string
|
||||
if len(patterns) > 0 {
|
||||
indices, err := ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
ids = make([]string, len(indices))
|
||||
for i, idx := range indices {
|
||||
ids[i] = ksg.SetIDOf(idx)
|
||||
}
|
||||
} else {
|
||||
// Copy all sets
|
||||
ids = ksg.SetsIDs()
|
||||
}
|
||||
|
||||
log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir)
|
||||
|
||||
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Infof("Destination now has %d set(s)", dest.Size())
|
||||
return nil
|
||||
}
|
||||
110
pkg/obitools/obik/index.go
Normal file
110
pkg/obitools/obik/index.go
Normal file
@@ -0,0 +1,110 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
outDir := CLIOutputDirectory()
|
||||
if outDir == "" || outDir == "-" {
|
||||
return fmt.Errorf("--out option is required and must specify a directory path")
|
||||
}
|
||||
|
||||
k := CLIKmerSize()
|
||||
if k < 2 || k > 31 {
|
||||
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
m := CLIMinimizerSize()
|
||||
|
||||
minOcc := CLIMinOccurrence()
|
||||
if minOcc < 1 {
|
||||
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
|
||||
}
|
||||
|
||||
// Build options
|
||||
var opts []obikmer.BuilderOption
|
||||
if minOcc > 1 {
|
||||
opts = append(opts, obikmer.WithMinFrequency(minOcc))
|
||||
}
|
||||
|
||||
// Determine whether to append to existing group or create new
|
||||
var builder *obikmer.KmerSetGroupBuilder
|
||||
var err error
|
||||
metaPath := filepath.Join(outDir, "metadata.toml")
|
||||
if _, statErr := os.Stat(metaPath); statErr == nil {
|
||||
// Existing group: append
|
||||
log.Infof("Appending to existing kmer index at %s", outDir)
|
||||
builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
|
||||
}
|
||||
} else {
|
||||
// New group
|
||||
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create kmer index builder: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Read and process sequences
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
seqCount := 0
|
||||
for sequences.Next() {
|
||||
batch := sequences.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
builder.AddSequence(0, seq)
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount)
|
||||
|
||||
// Finalize
|
||||
ksg, err := builder.Close()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to finalize kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Apply index-id to the new set
|
||||
newSetIdx := builder.StartIndex()
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ksg.SetSetID(newSetIdx, id)
|
||||
}
|
||||
|
||||
// Apply group-level tags (-S)
|
||||
for key, value := range CLISetTag() {
|
||||
ksg.SetAttribute(key, value)
|
||||
}
|
||||
|
||||
// Apply per-set tags (-T) to the new set
|
||||
for key, value := range _setMetaTags {
|
||||
ksg.SetSetMetadata(newSetIdx, key, value)
|
||||
}
|
||||
|
||||
if minOcc > 1 {
|
||||
ksg.SetAttribute("min_occurrence", minOcc)
|
||||
}
|
||||
|
||||
if err := ksg.SaveMetadata(); err != nil {
|
||||
return fmt.Errorf("failed to save metadata: %w", err)
|
||||
}
|
||||
|
||||
log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
|
||||
log.Info("Done.")
|
||||
return nil
|
||||
}
|
||||
@@ -1,39 +1,80 @@
|
||||
package obilowmask
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// MaskingMode selects how detected low-complexity regions are handled.
type MaskingMode int

const (
	MaskMode    MaskingMode = iota // replace low-complexity regions with a masking character
	SplitMode                      // split sequences, keeping only high-complexity fragments
	ExtractMode                    // keep only the low-complexity fragments
)
|
||||
|
||||
// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
||||
//
|
||||
// Algorithm principle:
|
||||
// Calculate the normalized entropy of each k-mer at different scales (wordSize = 1 to level_max).
|
||||
// K-mers with entropy below the threshold are masked.
|
||||
//
|
||||
// Parameters:
|
||||
// - kmer_size: size of the sliding window for entropy calculation
|
||||
// - level_max: maximum word size used for entropy calculation (finest scale)
|
||||
// - threshold: normalized entropy threshold below which masking occurs (between 0 and 1)
|
||||
// - mode: Mask (masking) or Split (splitting)
|
||||
// - maskChar: character used for masking (typically 'n' or 'N')
|
||||
//
|
||||
// Returns: a SeqWorker function that can be applied to each sequence
|
||||
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
||||
// Lowmask-specific option variables (separate from index/super kmer-size).
|
||||
var _lowmaskKmerSize = 31
|
||||
var _lowmaskLevelMax = 6
|
||||
var _lowmaskThreshold = 0.5
|
||||
var _lowmaskSplitMode = false
|
||||
var _lowmaskLowMode = false
|
||||
var _lowmaskMaskChar = "."
|
||||
|
||||
// LowMaskOptionSet registers options specific to low-complexity masking.
|
||||
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize,
|
||||
options.Description("Size of the kmer considered to estimate entropy."))
|
||||
|
||||
options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax,
|
||||
options.Description("Maximum word size considered for entropy estimate."))
|
||||
|
||||
options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold,
|
||||
options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))
|
||||
|
||||
options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode,
|
||||
options.Description("Split sequences to remove masked regions."))
|
||||
|
||||
options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode,
|
||||
options.Description("Extract only low-complexity regions."))
|
||||
|
||||
options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
|
||||
options.Description("Character used to mask low complexity regions."))
|
||||
}
|
||||
|
||||
func lowmaskMaskingMode() MaskingMode {
|
||||
switch {
|
||||
case _lowmaskLowMode:
|
||||
return ExtractMode
|
||||
case _lowmaskSplitMode:
|
||||
return SplitMode
|
||||
default:
|
||||
return MaskMode
|
||||
}
|
||||
}
|
||||
|
||||
func lowmaskMaskingChar() byte {
|
||||
mask := strings.TrimSpace(_lowmaskMaskChar)
|
||||
if len(mask) != 1 {
|
||||
log.Fatalf("--masking-char option accepts a single character, not %s", mask)
|
||||
}
|
||||
return []byte(mask)[0]
|
||||
}
|
||||
|
||||
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
||||
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
||||
|
||||
nLogN := make([]float64, kmer_size+1)
|
||||
for i := 1; i <= kmer_size; i++ {
|
||||
@@ -87,6 +128,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
data[i] = deque[0].value
|
||||
}
|
||||
}
|
||||
|
||||
emaxValues := make([]float64, level_max+1)
|
||||
logNwords := make([]float64, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
@@ -329,7 +371,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
|
||||
maskPositions := maskAmbiguities(bseq)
|
||||
|
||||
mask := make([]int, len(bseq))
|
||||
maskFlags := make([]int, len(bseq))
|
||||
entropies := make([]float64, len(bseq))
|
||||
for i := range entropies {
|
||||
entropies[i] = 4.0
|
||||
@@ -343,7 +385,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
|
||||
for i := range bseq {
|
||||
v := level_max
|
||||
mask[i] = v
|
||||
maskFlags[i] = v
|
||||
}
|
||||
|
||||
for ws := level_max - 1; ws > 0; ws-- {
|
||||
@@ -351,7 +393,7 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
for i, e2 := range entropies2 {
|
||||
if e2 < entropies[i] {
|
||||
entropies[i] = e2
|
||||
mask[i] = ws
|
||||
maskFlags[i] = ws
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -367,39 +409,49 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
remove[i] = e <= threshold
|
||||
}
|
||||
|
||||
sequence.SetAttribute("mask", mask)
|
||||
sequence.SetAttribute("mask", maskFlags)
|
||||
sequence.SetAttribute("Entropies", entropies)
|
||||
|
||||
switch mode {
|
||||
case Mask:
|
||||
case MaskMode:
|
||||
return applyMaskMode(sequence, remove, maskChar)
|
||||
case Split:
|
||||
case SplitMode:
|
||||
return selectunmasked(sequence, remove)
|
||||
case Extract:
|
||||
case ExtractMode:
|
||||
return selectMasked(sequence, remove)
|
||||
}
|
||||
return nil, fmt.Errorf("Unknown mode %d", mode)
|
||||
return nil, fmt.Errorf("unknown mode %d", mode)
|
||||
}
|
||||
|
||||
return masking
|
||||
}
|
||||
|
||||
// CLISequenceEntropyMasker creates an iterator that applies entropy masking
|
||||
// to all sequences in an input iterator.
|
||||
//
|
||||
// Uses command-line parameters to configure the worker.
|
||||
func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
var newIter obiiter.IBioSequence
|
||||
// runLowmask implements the "obik lowmask" subcommand.
|
||||
// It masks low-complexity regions in DNA sequences using entropy-based detection.
|
||||
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
kmerSize := _lowmaskKmerSize
|
||||
levelMax := _lowmaskLevelMax
|
||||
threshold := _lowmaskThreshold
|
||||
mode := lowmaskMaskingMode()
|
||||
maskChar := lowmaskMaskingChar()
|
||||
|
||||
worker := LowMaskWorker(
|
||||
CLIKmerSize(),
|
||||
CLILevelMax(),
|
||||
CLIThreshold(),
|
||||
CLIMaskingMode(),
|
||||
CLIMaskingChar(),
|
||||
)
|
||||
log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold)
|
||||
|
||||
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
|
||||
|
||||
return newIter.FilterEmpty()
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar)
|
||||
|
||||
masked := sequences.MakeIWorker(
|
||||
worker,
|
||||
false,
|
||||
obidefault.ParallelWorkers(),
|
||||
).FilterEmpty()
|
||||
|
||||
obiconvert.CLIWriteBioSequences(masked, true)
|
||||
obiutils.WaitForLastPipe()
|
||||
|
||||
return nil
|
||||
}
|
||||
96
pkg/obitools/obik/ls.go
Normal file
96
pkg/obitools/obik/ls.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// setEntry is one row of "obik ls" output: the positional index of a set
// within the group, its identifier, and its k-mer count.
type setEntry struct {
	Index int    `json:"index" yaml:"index"`
	ID    string `json:"id" yaml:"id"`
	Count uint64 `json:"count" yaml:"count"`
}
|
||||
|
||||
func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik ls [options] <index_directory>")
|
||||
}
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Determine which sets to show
|
||||
patterns := CLISetPatterns()
|
||||
var indices []int
|
||||
if len(patterns) > 0 {
|
||||
indices, err = ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
indices = make([]int, ksg.Size())
|
||||
for i := range indices {
|
||||
indices[i] = i
|
||||
}
|
||||
}
|
||||
|
||||
entries := make([]setEntry, len(indices))
|
||||
for i, idx := range indices {
|
||||
entries[i] = setEntry{
|
||||
Index: idx,
|
||||
ID: ksg.SetIDOf(idx),
|
||||
Count: ksg.Len(idx),
|
||||
}
|
||||
}
|
||||
|
||||
format := CLIOutFormat()
|
||||
switch format {
|
||||
case "json":
|
||||
return outputLsJSON(entries)
|
||||
case "yaml":
|
||||
return outputLsYAML(entries)
|
||||
case "csv":
|
||||
return outputLsCSV(entries)
|
||||
default:
|
||||
return outputLsCSV(entries)
|
||||
}
|
||||
}
|
||||
|
||||
func outputLsCSV(entries []setEntry) error {
|
||||
fmt.Println("index,id,count")
|
||||
for _, e := range entries {
|
||||
// Escape commas in ID if needed
|
||||
id := e.ID
|
||||
if strings.ContainsAny(id, ",\"") {
|
||||
id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\""
|
||||
}
|
||||
fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputLsJSON(entries []setEntry) error {
|
||||
data, err := json.MarshalIndent(entries, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputLsYAML(entries []setEntry) error {
|
||||
data, err := yaml.Marshal(entries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(string(data))
|
||||
return nil
|
||||
}
|
||||
63
pkg/obitools/obik/mv.go
Normal file
63
pkg/obitools/obik/mv.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] <source_index> <dest_index>")
|
||||
}
|
||||
|
||||
srcDir := args[0]
|
||||
destDir := args[1]
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open source kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Resolve set patterns
|
||||
patterns := CLISetPatterns()
|
||||
var ids []string
|
||||
if len(patterns) > 0 {
|
||||
indices, err := ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
ids = make([]string, len(indices))
|
||||
for i, idx := range indices {
|
||||
ids[i] = ksg.SetIDOf(idx)
|
||||
}
|
||||
} else {
|
||||
// Move all sets
|
||||
ids = ksg.SetsIDs()
|
||||
}
|
||||
|
||||
log.Infof("Moving %d set(s) from %s to %s", len(ids), srcDir, destDir)
|
||||
|
||||
// Copy first
|
||||
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Remove from source (in reverse order to avoid renumbering issues)
|
||||
for i := len(ids) - 1; i >= 0; i-- {
|
||||
if err := ksg.RemoveSetByID(ids[i]); err != nil {
|
||||
return fmt.Errorf("failed to remove set %q from source after copy: %w", ids[i], err)
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Destination now has %d set(s), source has %d set(s)", dest.Size(), ksg.Size())
|
||||
return nil
|
||||
}
|
||||
64
pkg/obitools/obik/obik.go
Normal file
64
pkg/obitools/obik/obik.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// OptionSet registers all obik subcommands on the root GetOpt.
|
||||
func OptionSet(opt *getoptions.GetOpt) {
|
||||
// index: build or extend a kmer index from sequence files
|
||||
indexCmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files")
|
||||
obiconvert.InputOptionSet(indexCmd)
|
||||
obiconvert.OutputModeOptionSet(indexCmd, false)
|
||||
KmerIndexOptionSet(indexCmd)
|
||||
indexCmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
|
||||
indexCmd.Alias("T"),
|
||||
indexCmd.ArgName("KEY=VALUE"),
|
||||
indexCmd.Description("Per-set metadata tag (repeatable)."))
|
||||
indexCmd.SetCommandFn(runIndex)
|
||||
|
||||
// ls: list sets in a kmer index
|
||||
lsCmd := opt.NewCommand("ls", "List sets in a kmer index")
|
||||
OutputFormatOptionSet(lsCmd)
|
||||
SetSelectionOptionSet(lsCmd)
|
||||
lsCmd.SetCommandFn(runLs)
|
||||
|
||||
// summary: detailed statistics
|
||||
summaryCmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index")
|
||||
OutputFormatOptionSet(summaryCmd)
|
||||
summaryCmd.BoolVar(&_jaccard, "jaccard", false,
|
||||
summaryCmd.Description("Compute and display pairwise Jaccard distance matrix."))
|
||||
summaryCmd.SetCommandFn(runSummary)
|
||||
|
||||
// cp: copy sets between indices
|
||||
cpCmd := opt.NewCommand("cp", "Copy sets between kmer indices")
|
||||
SetSelectionOptionSet(cpCmd)
|
||||
ForceOptionSet(cpCmd)
|
||||
cpCmd.SetCommandFn(runCp)
|
||||
|
||||
// mv: move sets between indices
|
||||
mvCmd := opt.NewCommand("mv", "Move sets between kmer indices")
|
||||
SetSelectionOptionSet(mvCmd)
|
||||
ForceOptionSet(mvCmd)
|
||||
mvCmd.SetCommandFn(runMv)
|
||||
|
||||
// rm: remove sets from an index
|
||||
rmCmd := opt.NewCommand("rm", "Remove sets from a kmer index")
|
||||
SetSelectionOptionSet(rmCmd)
|
||||
rmCmd.SetCommandFn(runRm)
|
||||
|
||||
// super: extract super k-mers from sequences
|
||||
superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
|
||||
obiconvert.InputOptionSet(superCmd)
|
||||
obiconvert.OutputOptionSet(superCmd)
|
||||
SuperKmerOptionSet(superCmd)
|
||||
superCmd.SetCommandFn(runSuper)
|
||||
|
||||
// lowmask: mask low-complexity regions
|
||||
lowmaskCmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy")
|
||||
obiconvert.InputOptionSet(lowmaskCmd)
|
||||
obiconvert.OutputOptionSet(lowmaskCmd)
|
||||
LowMaskOptionSet(lowmaskCmd)
|
||||
lowmaskCmd.SetCommandFn(runLowmask)
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package obikindex
|
||||
package obik
|
||||
|
||||
import (
|
||||
"strings"
|
||||
@@ -11,7 +11,27 @@ import (
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Private variables for storing option values
|
||||
// Output format flags
|
||||
var _jsonOutput bool
|
||||
var _csvOutput bool
|
||||
var _yamlOutput bool
|
||||
|
||||
// Set selection flags
|
||||
var _setPatterns []string
|
||||
|
||||
// Force flag
|
||||
var _force bool
|
||||
|
||||
// Jaccard flag
|
||||
var _jaccard bool
|
||||
|
||||
// Per-set tags for index subcommand
|
||||
var _setMetaTags = make(map[string]string, 0)
|
||||
|
||||
// ==============================
|
||||
// Kmer index building options (moved from obikindex)
|
||||
// ==============================
|
||||
|
||||
var _kmerSize = 31
|
||||
var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5)
|
||||
var _indexId = ""
|
||||
@@ -39,7 +59,7 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringMapVar(&_setTag, "set-tag", 1, 1,
|
||||
options.Alias("S"),
|
||||
options.ArgName("KEY=VALUE"),
|
||||
options.Description("Adds a metadata attribute KEY with value VALUE to the index."))
|
||||
options.Description("Adds a group-level metadata attribute KEY with value VALUE."))
|
||||
|
||||
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
|
||||
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
|
||||
@@ -48,23 +68,12 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
|
||||
// the obikindex command.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.InputOptionSet(options)
|
||||
obiconvert.OutputModeOptionSet(options, false)
|
||||
KmerIndexOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size.
|
||||
func CLIKmerSize() int {
|
||||
return _kmerSize
|
||||
}
|
||||
|
||||
// CLIMinimizerSize returns the effective minimizer size.
|
||||
// If -1 (auto), computes ceil(k / 2.5) then applies constraints:
|
||||
// - minimum: ceil(log(nworkers) / log(4))
|
||||
// - maximum: k - 1
|
||||
func CLIMinimizerSize() int {
|
||||
m := _minimizerSize
|
||||
if m < 0 {
|
||||
@@ -95,7 +104,7 @@ func CLIMetadataFormat() obikmer.MetadataFormat {
|
||||
}
|
||||
}
|
||||
|
||||
// CLISetTag returns the metadata key=value pairs.
|
||||
// CLISetTag returns the group-level metadata key=value pairs.
|
||||
func CLISetTag() map[string]string {
|
||||
return _setTag
|
||||
}
|
||||
@@ -129,3 +138,52 @@ func SetMinimizerSize(m int) {
|
||||
func SetMinOccurrence(n int) {
|
||||
_minOccurrence = n
|
||||
}
|
||||
|
||||
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
||||
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_jsonOutput, "json-output", false,
|
||||
options.Description("Print results as JSON."))
|
||||
options.BoolVar(&_csvOutput, "csv-output", false,
|
||||
options.Description("Print results as CSV."))
|
||||
options.BoolVar(&_yamlOutput, "yaml-output", false,
|
||||
options.Description("Print results as YAML."))
|
||||
}
|
||||
|
||||
// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text".
|
||||
func CLIOutFormat() string {
|
||||
if _jsonOutput {
|
||||
return "json"
|
||||
}
|
||||
if _csvOutput {
|
||||
return "csv"
|
||||
}
|
||||
if _yamlOutput {
|
||||
return "yaml"
|
||||
}
|
||||
return "text"
|
||||
}
|
||||
|
||||
// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
|
||||
func SetSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringSliceVar(&_setPatterns, "set", 1, 1,
|
||||
options.Alias("s"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
|
||||
}
|
||||
|
||||
// CLISetPatterns returns the --set patterns provided by the user.
|
||||
func CLISetPatterns() []string {
|
||||
return _setPatterns
|
||||
}
|
||||
|
||||
// ForceOptionSet registers --force / -f.
|
||||
func ForceOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_force, "force", false,
|
||||
options.Alias("f"),
|
||||
options.Description("Force operation even if set ID already exists in destination."))
|
||||
}
|
||||
|
||||
// CLIForce returns whether --force was specified.
|
||||
func CLIForce() bool {
|
||||
return _force
|
||||
}
|
||||
56
pkg/obitools/obik/rm.go
Normal file
56
pkg/obitools/obik/rm.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... <index_directory>")
|
||||
}
|
||||
|
||||
patterns := CLISetPatterns()
|
||||
if len(patterns) == 0 {
|
||||
return fmt.Errorf("--set is required (specify which sets to remove)")
|
||||
}
|
||||
|
||||
indexDir := args[0]
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(indexDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
indices, err := ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
|
||||
// Collect IDs before removal (indices shift as we remove)
|
||||
ids := make([]string, len(indices))
|
||||
for i, idx := range indices {
|
||||
ids[i] = ksg.SetIDOf(idx)
|
||||
}
|
||||
|
||||
log.Infof("Removing %d set(s) from %s", len(ids), indexDir)
|
||||
|
||||
// Remove in reverse order to avoid renumbering issues
|
||||
for i := len(ids) - 1; i >= 0; i-- {
|
||||
if err := ksg.RemoveSetByID(ids[i]); err != nil {
|
||||
return fmt.Errorf("failed to remove set %q: %w", ids[i], err)
|
||||
}
|
||||
log.Infof("Removed set %q", ids[i])
|
||||
}
|
||||
|
||||
log.Infof("Index now has %d set(s)", ksg.Size())
|
||||
return nil
|
||||
}
|
||||
148
pkg/obitools/obik/summary.go
Normal file
148
pkg/obitools/obik/summary.go
Normal file
@@ -0,0 +1,148 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// setSummary describes a single set for "obik summary": its position, ID,
// k-mer count, on-disk footprint, and per-set metadata.
type setSummary struct {
	Index    int                    `json:"index" yaml:"index"`
	ID       string                 `json:"id" yaml:"id"`
	Count    uint64                 `json:"count" yaml:"count"`
	DiskSize int64                  `json:"disk_bytes" yaml:"disk_bytes"`
	Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
}
|
||||
|
||||
type groupSummary struct {
|
||||
Path string `json:"path" yaml:"path"`
|
||||
ID string `json:"id,omitempty" yaml:"id,omitempty"`
|
||||
K int `json:"k" yaml:"k"`
|
||||
M int `json:"m" yaml:"m"`
|
||||
Partitions int `json:"partitions" yaml:"partitions"`
|
||||
TotalSets int `json:"total_sets" yaml:"total_sets"`
|
||||
TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"`
|
||||
TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"`
|
||||
Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
|
||||
Sets []setSummary `json:"sets" yaml:"sets"`
|
||||
Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"`
|
||||
}
|
||||
|
||||
func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik summary [options] <index_directory>")
|
||||
}
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
summary := groupSummary{
|
||||
Path: ksg.Path(),
|
||||
ID: ksg.Id(),
|
||||
K: ksg.K(),
|
||||
M: ksg.M(),
|
||||
Partitions: ksg.Partitions(),
|
||||
TotalSets: ksg.Size(),
|
||||
TotalKmers: ksg.Len(),
|
||||
Metadata: ksg.Metadata,
|
||||
Sets: make([]setSummary, ksg.Size()),
|
||||
}
|
||||
|
||||
var totalDisk int64
|
||||
for i := 0; i < ksg.Size(); i++ {
|
||||
diskSize := computeSetDiskSize(ksg, i)
|
||||
totalDisk += diskSize
|
||||
summary.Sets[i] = setSummary{
|
||||
Index: i,
|
||||
ID: ksg.SetIDOf(i),
|
||||
Count: ksg.Len(i),
|
||||
DiskSize: diskSize,
|
||||
Metadata: ksg.AllSetMetadata(i),
|
||||
}
|
||||
}
|
||||
summary.TotalDisk = totalDisk
|
||||
|
||||
// Jaccard matrix
|
||||
if _jaccard && ksg.Size() > 1 {
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
n := ksg.Size()
|
||||
matrix := make([][]float64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
matrix[i] = make([]float64, n)
|
||||
for j := 0; j < n; j++ {
|
||||
if i == j {
|
||||
matrix[i][j] = 0
|
||||
} else {
|
||||
matrix[i][j] = dm.Get(i, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
summary.Jaccard = matrix
|
||||
}
|
||||
|
||||
format := CLIOutFormat()
|
||||
switch format {
|
||||
case "json":
|
||||
return outputSummaryJSON(summary)
|
||||
case "yaml":
|
||||
return outputSummaryYAML(summary)
|
||||
case "csv":
|
||||
return outputSummaryCSV(summary)
|
||||
default:
|
||||
return outputSummaryJSON(summary)
|
||||
}
|
||||
}
|
||||
|
||||
func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 {
|
||||
var total int64
|
||||
for p := 0; p < ksg.Partitions(); p++ {
|
||||
path := ksg.PartitionPath(setIndex, p)
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
total += info.Size()
|
||||
}
|
||||
// Also count the set directory entry itself
|
||||
setDir := filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex))
|
||||
entries, err := os.ReadDir(setDir)
|
||||
if err == nil {
|
||||
// We already counted .kdi files above; this is just for completeness
|
||||
_ = entries
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func outputSummaryJSON(summary groupSummary) error {
|
||||
data, err := json.MarshalIndent(summary, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputSummaryYAML(summary groupSummary) error {
|
||||
data, err := yaml.Marshal(summary)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputSummaryCSV(summary groupSummary) error {
|
||||
fmt.Println("index,id,count,disk_bytes")
|
||||
for _, s := range summary.Sets {
|
||||
fmt.Printf("%d,%s,%d,%d\n", s.Index, s.ID, s.Count, s.DiskSize)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
64
pkg/obitools/obik/super.go
Normal file
64
pkg/obitools/obik/super.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Super k-mer specific option variables.
|
||||
// These reuse _kmerSize and _minimizerSize from options.go since
|
||||
// only one subcommand runs at a time.
|
||||
|
||||
// SuperKmerOptionSet registers options specific to super k-mer extraction.
|
||||
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
||||
|
||||
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
||||
}
|
||||
|
||||
// runSuper implements the "obik super" subcommand.
|
||||
// It extracts super k-mers from DNA sequences.
|
||||
func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
k := _kmerSize
|
||||
m := _minimizerSize
|
||||
|
||||
if k < 2 || k > 31 {
|
||||
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
if m < 1 || m >= k {
|
||||
return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||
}
|
||||
|
||||
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
worker := obikmer.SuperKmerWorker(k, m)
|
||||
|
||||
superkmers := sequences.MakeIWorker(
|
||||
worker,
|
||||
false,
|
||||
obidefault.ParallelWorkers(),
|
||||
)
|
||||
|
||||
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||
obiutils.WaitForLastPipe()
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -1,91 +0,0 @@
|
||||
package obikindex
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
)
|
||||
|
||||
// CLIBuildKmerIndex reads sequences from the iterator and builds a
// disk-based kmer index using the KmerSetGroupBuilder.
//
// The function is strictly ordered: options are validated first, every
// sequence is fed to the builder, the builder is closed to finalize the
// index on disk, and only then is metadata applied and persisted. Any
// failure aborts the program via log.Fatalf.
func CLIBuildKmerIndex(iterator obiiter.IBioSequence) {
	// Validate output directory: the index is a directory tree, so
	// writing to stdout ("-") makes no sense here.
	outDir := CLIOutputDirectory()
	if outDir == "" || outDir == "-" {
		log.Fatalf("Error: --out option is required and must specify a directory path (not stdout)")
	}

	// Validate k-mer size (31 is the upper bound supported by the index).
	k := CLIKmerSize()
	if k < 2 || k > 31 {
		log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
	}

	// Resolve minimizer size (not range-checked here; presumably the
	// builder validates it — TODO confirm).
	m := CLIMinimizerSize()

	// Validate min-occurrence
	minOcc := CLIMinOccurrence()
	if minOcc < 1 {
		log.Fatalf("Invalid min-occurrence: %d (must be >= 1)", minOcc)
	}

	log.Infof("Building kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)

	// Build options: a frequency filter is only added when it would have
	// an effect (minOcc == 1 keeps every kmer).
	var opts []obikmer.BuilderOption
	if minOcc > 1 {
		opts = append(opts, obikmer.WithMinFrequency(minOcc))
	}

	// Create builder (1 set, auto partitions: -1 lets the builder choose).
	builder, err := obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
	if err != nil {
		log.Fatalf("Failed to create kmer index builder: %v", err)
	}

	// Feed every sequence of every batch into set 0 of the builder.
	seqCount := 0
	for iterator.Next() {
		batch := iterator.Get()
		for _, seq := range batch.Slice() {
			builder.AddSequence(0, seq)
			seqCount++
		}
	}

	log.Infof("Processed %d sequences", seqCount)

	// Finalize: Close flushes the index to disk and returns the group.
	ksg, err := builder.Close()
	if err != nil {
		log.Fatalf("Failed to finalize kmer index: %v", err)
	}

	// Apply the CLI-provided id and --set-tag metadata.
	applyMetadata(ksg)

	// Record the frequency filter used, so readers of the index know it.
	if minOcc > 1 {
		ksg.SetAttribute("min_occurrence", minOcc)
	}

	// Persist metadata updates (metadata is only written on SaveMetadata).
	if err := ksg.SaveMetadata(); err != nil {
		log.Fatalf("Failed to save metadata: %v", err)
	}

	log.Infof("Index contains %d k-mers in %s", ksg.Len(0), outDir)
	log.Info("Done.")
}
|
||||
|
||||
// applyMetadata sets index-id and --set-tag metadata on a KmerSetGroup.
|
||||
func applyMetadata(ksg *obikmer.KmerSetGroup) {
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ksg.SetId(id)
|
||||
}
|
||||
|
||||
for key, value := range CLISetTag() {
|
||||
ksg.SetAttribute(key, value)
|
||||
}
|
||||
}
|
||||
@@ -1,332 +0,0 @@
|
||||
```{r}
|
||||
library(tidyverse)
|
||||
```
|
||||
|
||||
```{r}
|
||||
x <- sample(1:4096, 29, replace=TRUE)
|
||||
```
|
||||
|
||||
```{r}
|
||||
# emax computes the maximal achievable Shannon entropy (in nats) of the
# word spectrum of a sequence of length `lseq`, for words of `word_size`
# letters over the 4-letter DNA alphabet.
#
# With nword = lseq - word_size + 1 overlapping words and
# nalpha = 4^word_size possible distinct words, the entropy is maximal
# when counts are as even as possible:
#   - if every word can be distinct (nalpha >= nword): emax = log(nword);
#   - otherwise, `remains` words occur cov + 1 times and the rest cov
#     times, where cov = nword %/% nalpha.
#
# Fix: a leftover debugging print() of the intermediate quantities has
# been removed (its cleaned-up sibling enmax() never had it).
emax <- function(lseq, word_size) {
  nword <- lseq - word_size + 1
  nalpha <- 4^word_size

  if (nalpha < nword) {
    cov <- nword %/% nalpha
    remains <- nword %% nalpha
    f1 <- cov / nword
    f2 <- (cov + 1) / nword
    e <- -(nalpha - remains) * f1 * log(f1) -
      remains * f2 * log(f2)
  } else {
    e <- log(nword)
  }

  e
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# ec: normalised k-mer entropy of a word vector `data`.
# The observed entropy of the count distribution is divided by the maximal
# entropy emax() reachable for a sequence yielding the same word count.
ec <- function(data, kmer_size) {
  counts <- table(data)
  total <- sum(counts)

  # Shannon entropy written as log(total) - sum(n_i * log(n_i)) / total.
  observed <- log(total) - sum(counts * log(counts)) / total

  observed / emax(total + kmer_size - 1, kmer_size)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# ef: normalised k-mer entropy computed from relative frequencies.
# Equivalent to ec() but derives the Shannon entropy from the frequency
# vector directly; zero frequencies are dropped before taking logs.
#
# Fix: a leftover debugging print() of (ed, em, ed/em) has been removed
# (the refined sibling ecn() never had it).
ef <- function(data, kmer_size) {
  counts <- table(data)
  s <- sum(counts)

  f <- as.numeric(counts / s)
  f <- f[f > 0]

  em <- emax(s + kmer_size - 1, kmer_size)
  ed <- -sum(f * log(f))

  ed / em
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# okmer returns every overlapping k-mer of the (single) string `data`,
# in order of their start position.
okmer <- function(data, kmer_size) {
  starts <- 1:(nchar(data) - kmer_size + 1)
  substring(data, starts, starts + kmer_size - 1)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Circular normalisation: maps a k-mer onto the lexicographically smallest
# of its circular rotations, so that all rotations of the same word share
# a single canonical representative.
normalize_circular <- function(kmer) {
  n <- nchar(kmer)
  if (n == 0) return(kmer)

  # Every rotation of `kmer` is a length-n window of kmer pasted to itself;
  # the window starting at 1 is the original word.
  doubled <- paste0(kmer, kmer)
  rotations <- substring(doubled, 1:n, n:(2 * n - 1))

  min(rotations)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Euler's totient function: counts the integers from 1 to n coprime with n.
# Each prime factor p is stripped out of n while applying
# phi(n) = n * (1 - 1/p) to the running result. Note that n itself is
# consumed (divided down) during the scan, which is why the loop bound
# p * p <= n keeps shrinking as factors are removed.
euler_totient <- function(n) {
  if (n <= 0) return(0)

  result <- n
  p <- 2

  # Process every prime factor up to sqrt of the remaining n
  while (p * p <= n) {
    if (n %% p == 0) {
      # Strip all occurrences of p
      while (n %% p == 0) {
        n <- n %/% p
      }
      # Apply the formula: phi(n) = n * (1 - 1/p)
      result <- result - result %/% p
    }
    p <- p + 1
  }

  # If n is still > 1 it is itself a remaining prime factor
  if (n > 1) {
    result <- result - result %/% n
  }

  result
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# divisors returns all positive divisors of n in increasing order
# (an empty vector for n <= 0). Divisors are discovered in pairs
# (d, n / d) while scanning d up to sqrt(n).
divisors <- function(n) {
  if (n <= 0) return(integer(0))

  found <- c()
  d <- 1
  while (d * d <= n) {
    if (n %% d == 0) {
      found <- c(found, d)
      partner <- n %/% d
      # A perfect square contributes its root only once.
      if (partner != d) found <- c(found, partner)
    }
    d <- d + 1
  }

  sort(found)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Counts the distinct necklaces (equivalence classes of strings under
# rotation) of length n over an alphabet of size `alphabet_size`, using
# Moreau's formula:
#   N(n, a) = (1/n) * sum_{d | n} phi(d) * a^(n/d)
# where the sum runs over the divisors d of n and phi is Euler's totient.
necklace_count <- function(n, alphabet_size) {
  if (n <= 0) return(0)

  terms <- sapply(divisors(n), function(d) {
    euler_totient(d) * alphabet_size^(n %/% d)
  })

  sum(terms) %/% n
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Number of equivalence classes of DNA k-mers under circular rotation.
# Values for k = 1..6 are precomputed; larger sizes fall back to Moreau's
# exact necklace-counting formula on the 4-letter DNA alphabet.
n_normalized_kmers <- function(kmer_size) {
  exact <- c(`1` = 4, `2` = 10, `3` = 24, `4` = 70, `5` = 208, `6` = 700)

  key <- as.character(kmer_size)
  if (key %in% names(exact)) {
    return(exact[[key]])
  }

  necklace_count(kmer_size, 4)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Maximal achievable entropy for circularly-normalised k-mers.
# Identical in structure to emax(), except that the alphabet size is the
# number of rotation-equivalence classes instead of 4^word_size.
enmax <- function(lseq, word_size) {
  nword <- lseq - word_size + 1
  nalpha <- n_normalized_kmers(word_size)

  # Enough distinct classes for every word: uniform distribution.
  if (nalpha >= nword) return(log(nword))

  # Otherwise spread nword occurrences as evenly as possible over nalpha
  # classes: `remains` classes hold cov + 1 words, the rest hold cov.
  cov <- nword %/% nalpha
  remains <- nword %% nalpha
  flow <- cov / nword
  fhigh <- (cov + 1) / nword
  -(nalpha - remains) * flow * log(flow) - remains * fhigh * log(fhigh)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Normalised entropy with circular normalisation of the k-mers: all
# rotations of a k-mer are first collapsed onto one canonical
# representative before counting.
ecn <- function(data, kmer_size) {
  # Collapse each k-mer onto its canonical rotation.
  canonical <- sapply(data, normalize_circular)

  counts <- table(canonical)
  s <- sum(counts)
  observed <- log(s) - sum(counts * log(counts)) / s

  # Maximal entropy under normalisation.
  observed / enmax(s + kmer_size - 1, kmer_size)
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
|
||||
ec(okmer(k,1),1)
|
||||
ec(okmer(k,2),2)
|
||||
ec(okmer(k,3),3)
|
||||
ec(okmer(k,4),4)
|
||||
```
|
||||
|
||||
```{r}
|
||||
k<-'atatatatatatatatatatatatatatata'
|
||||
ef(okmer(k,1),1)
|
||||
ef(okmer(k,2),2)
|
||||
ef(okmer(k,3),3)
|
||||
ef(okmer(k,4),4)
|
||||
```
|
||||
|
||||
```{r}
|
||||
k<-'aaaaaaaaaaaaaaaattttttttttttttt'
|
||||
ef(okmer(k,1),1)
|
||||
ef(okmer(k,2),2)
|
||||
ef(okmer(k,3),3)
|
||||
ef(okmer(k,4),4)
|
||||
```
|
||||
|
||||
```{r}
|
||||
k<-'atgatgatgatgatgatgatgatgatgatga'
|
||||
ef(okmer(k,1),1)
|
||||
ef(okmer(k,2),2)
|
||||
ef(okmer(k,3),3)
|
||||
ef(okmer(k,4),4)
|
||||
```
|
||||
|
||||
```{r}
|
||||
k<-'atcgatcgatcgatcgatcgatcgatcgact'
|
||||
ecn(okmer(k,1),1)
|
||||
ecn(okmer(k,2),2)
|
||||
ecn(okmer(k,3),3)
|
||||
ecn(okmer(k,4),4)
|
||||
```
|
||||
|
||||
```{r}
|
||||
k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="")
|
||||
k <- "actatggcaagtcgtaaccgcgcttatcagg"
|
||||
ecn(okmer(k,1),1)
|
||||
ecn(okmer(k,2),2)
|
||||
ecn(okmer(k,3),3)
|
||||
ecn(okmer(k,4),4)
|
||||
```
|
||||
|
||||
aattaaaaaaacaagataaaataatattttt
|
||||
|
||||
```{r}
|
||||
k<-'aattaaaaaaacaagataaaataatattttt'
|
||||
ecn(okmer(k,1),1)
|
||||
ecn(okmer(k,2),2)
|
||||
ecn(okmer(k,3),3)
|
||||
ecn(okmer(k,4),4)
|
||||
```
|
||||
|
||||
atg tga gat ,,,,
|
||||
|
||||
cat tca atc
|
||||
|
||||
tgatgatgatgatgatgatgatgatgatg
|
||||
|
||||
## Tests de normalisation circulaire
|
||||
|
||||
```{r}
|
||||
# Test de la fonction de normalisation
|
||||
normalize_circular("ca") # devrait donner "ac"
|
||||
normalize_circular("tgca") # devrait donner "atgc"
|
||||
normalize_circular("acgt") # devrait donner "acgt"
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Comparaison ec vs ecn sur une séquence répétitive
|
||||
# Les k-mers "atg", "tga", "gat" sont équivalents par rotation
|
||||
k <- 'atgatgatgatgatgatgatgatgatgatga'
|
||||
cat("Séquence:", k, "\n")
|
||||
cat("ec(k,3) =", ec(okmer(k,3),3), "\n")
|
||||
cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n")
|
||||
```
|
||||
|
||||
```{r}
|
||||
# Comparaison sur séquence aléatoire
|
||||
k <- "actatggcaagtcgtaaccgcgcttatcagg"
|
||||
cat("Séquence:", k, "\n")
|
||||
cat("Sans normalisation:\n")
|
||||
cat(" ec(k,2) =", ec(okmer(k,2),2), "\n")
|
||||
cat(" ec(k,3) =", ec(okmer(k,3),3), "\n")
|
||||
cat(" ec(k,4) =", ec(okmer(k,4),4), "\n")
|
||||
cat("Avec normalisation circulaire:\n")
|
||||
cat(" ecn(k,2) =", ecn(okmer(k,2),2), "\n")
|
||||
cat(" ecn(k,3) =", ecn(okmer(k,3),3), "\n")
|
||||
cat(" ecn(k,4) =", ecn(okmer(k,4),4), "\n")
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
sequence <- "ttcatcactcagcaatcctgaatgatGAGAGCTTTTTTTTTTTATATATATATATATGTATATGTATGAAATACACTtatgctccgtttgtttcgccgtaa"
|
||||
re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935))
|
||||
|
||||
di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116)
|
||||
|
||||
ebidir <- tibble(direct=di,reverse=re) %>%
|
||||
mutate(position = 1:length(re),
|
||||
nucleotide = str_sub(sequence,position,position))
|
||||
|
||||
ebidir %>%
|
||||
ggplot(aes(x=position,y=direct)) +
|
||||
geom_line() +
|
||||
scale_x_continuous(breaks = ebidir$position, labels = ebidir$nucleotide) +
|
||||
ylim(0,1)+
|
||||
geom_hline(yintercept=0.5, col = "red", linetype = "dashed")
|
||||
```
|
||||
@@ -1,40 +0,0 @@
|
||||
package obilowmask
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
func TestLowMaskWorker(t *testing.T) {
|
||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||
result, err := worker(seq)
|
||||
if err != nil {
|
||||
t.Fatalf("Worker failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Len() != 1 {
|
||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||
}
|
||||
|
||||
resultSeq := result[0]
|
||||
if resultSeq.Len() != 32 {
|
||||
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
|
||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||
result, err := worker(seq)
|
||||
if err != nil {
|
||||
t.Fatalf("Worker failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Len() != 1 {
|
||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||
}
|
||||
}
|
||||
@@ -1,81 +0,0 @@
|
||||
package obilowmask
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Package-level storage for the obilowmask command-line option values;
// LowMaskOptionSet wires them to the corresponding flags.
var __kmer_size__ = 31     // --kmer-size: size of the kmer considered to estimate entropy
var __level_max__ = 6      // --entropy_size: maximum word size considered for the entropy estimate
var __threshold__ = 0.5    // --threshold: entropy threshold used to mask a kmer
var __split_mode__ = false // --split-mode: selects the Split masking mode (see CLIMaskingMode)
var __low_mode__ = false   // --low-mode: selects the Extract masking mode (see CLIMaskingMode)
var __mask__ = "."         // --masking-char: character used to mask low complexity regions
|
||||
|
||||
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__,
|
||||
options.Description("Size of the kmer considered to estimate entropy."),
|
||||
)
|
||||
|
||||
options.IntVar(&__level_max__, "entropy_size", __level_max__,
|
||||
options.Description("Maximum word size considered for entropy estimate"),
|
||||
)
|
||||
|
||||
options.Float64Var(&__threshold__, "threshold", __threshold__,
|
||||
options.Description("entropy theshold used to mask a kmer"),
|
||||
)
|
||||
|
||||
options.BoolVar(&__split_mode__, "split-mode", __split_mode__,
|
||||
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
||||
)
|
||||
|
||||
options.BoolVar(&__low_mode__, "low-mode", __low_mode__,
|
||||
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
||||
)
|
||||
|
||||
options.StringVar(&__mask__, "masking-char", __mask__,
|
||||
options.Description("Character used to mask low complexity region"),
|
||||
)
|
||||
}
|
||||
|
||||
// OptionSet registers the complete option set of the command: the
// masking-specific options followed by the standard obiconvert input and
// output options.
func OptionSet(options *getoptions.GetOpt) {
	LowMaskOptionSet(options)
	obiconvert.InputOptionSet(options)
	obiconvert.OutputOptionSet(options)
}
|
||||
|
||||
// CLIKmerSize returns the kmer size selected with --kmer-size.
func CLIKmerSize() int {
	return __kmer_size__
}

// CLILevelMax returns the maximum word size selected with --entropy_size.
func CLILevelMax() int {
	return __level_max__
}

// CLIThreshold returns the entropy threshold selected with --threshold.
func CLIThreshold() float64 {
	return __threshold__
}
|
||||
|
||||
// CLIMaskingMode returns the masking mode selected on the command line.
// --low-mode takes precedence over --split-mode when both flags are set;
// without either flag, the default Mask mode is returned.
func CLIMaskingMode() MaskingMode {
	switch {
	case __low_mode__:
		return Extract
	case __split_mode__:
		return Split
	default:
		return Mask
	}
}
|
||||
|
||||
func CLIMaskingChar() byte {
|
||||
mask := strings.TrimSpace(__mask__)
|
||||
if len(mask) != 1 {
|
||||
log.Fatalf("--masking-char option accept a single character, not %s", mask)
|
||||
}
|
||||
return []byte(mask)[0]
|
||||
}
|
||||
@@ -1,10 +0,0 @@
|
||||
// obisuperkmer function utility package.
|
||||
//
|
||||
// The obitools/obisuperkmer package contains every
|
||||
// function specifically required by the obisuperkmer utility.
|
||||
//
|
||||
// The obisuperkmer command extracts super k-mers from DNA sequences.
|
||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. This decomposition is useful for efficient
|
||||
// k-mer indexing and analysis in bioinformatics applications.
|
||||
package obisuperkmer
|
||||
@@ -1,69 +0,0 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Private variables for storing option values.
// NOTE(review): TestOptionGettersAndSetters expects defaults of 21/11,
// which disagrees with the 31/13 declared here — confirm which is intended.
var _KmerSize = 31      // --kmer-size (-k)
var _MinimizerSize = 13 // --minimizer-size (-m)
|
||||
|
||||
// SuperKmerOptionSet defines every option related to super k-mer extraction.
|
||||
//
|
||||
// The function adds to a CLI every option proposed to the user
|
||||
// to tune the parameters of the super k-mer extraction algorithm.
|
||||
//
|
||||
// Parameters:
|
||||
// - options: is a pointer to a getoptions.GetOpt instance normally
|
||||
// produced by the obioptions.GenerateOptionParser function.
|
||||
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
||||
|
||||
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
// the obisuperkmer command: the shared obiconvert options followed by the
// super k-mer specific ones.
//
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.OptionSet(false)(options)
	SuperKmerOptionSet(options)
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
//
// It takes no parameters and returns the current k-mer size.
func CLIKmerSize() int {
	return _KmerSize
}

// SetKmerSize sets the k-mer size for super k-mer extraction.
// The value is not validated here; callers check the m+1..31 range.
//
// Parameters:
//   - k: the k-mer size (must be between m+1 and 31).
func SetKmerSize(k int) {
	_KmerSize = k
}

// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
//
// It takes no parameters and returns the current minimizer size.
func CLIMinimizerSize() int {
	return _MinimizerSize
}

// SetMinimizerSize sets the minimizer size for super k-mer extraction.
// The value is not validated here; callers check the 1..k-1 range.
//
// Parameters:
//   - m: the minimizer size (must be between 1 and k-1).
func SetMinimizerSize(m int) {
	_MinimizerSize = m
}
|
||||
@@ -1,59 +0,0 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
)
|
||||
|
||||
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
|
||||
//
|
||||
// This function takes an iterator of BioSequence objects, extracts super k-mers
|
||||
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
|
||||
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
|
||||
//
|
||||
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
|
||||
// the same minimizer. The resulting BioSequences contain metadata including:
|
||||
// - minimizer_value: the canonical minimizer value
|
||||
// - minimizer_seq: the DNA sequence of the minimizer
|
||||
// - k: the k-mer size used
|
||||
// - m: the minimizer size used
|
||||
// - start: starting position in the original sequence
|
||||
// - end: ending position in the original sequence
|
||||
// - parent_id: ID of the parent sequence
|
||||
//
|
||||
// Parameters:
|
||||
// - iterator: an iterator yielding BioSequence objects to process.
|
||||
//
|
||||
// Returns:
|
||||
// - An iterator yielding BioSequence objects representing super k-mers.
|
||||
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
// Get k-mer and minimizer sizes from CLI options
|
||||
k := CLIKmerSize()
|
||||
m := CLIMinimizerSize()
|
||||
|
||||
// Validate parameters
|
||||
if m < 1 || m >= k {
|
||||
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||
}
|
||||
|
||||
if k < 2 || k > 31 {
|
||||
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||
|
||||
// Create the worker for super k-mer extraction
|
||||
worker := obikmer.SuperKmerWorker(k, m)
|
||||
|
||||
// Apply the worker to the iterator with parallel processing
|
||||
newIter := iterator.MakeIWorker(
|
||||
worker,
|
||||
false, // don't merge results
|
||||
obidefault.ParallelWorkers(),
|
||||
)
|
||||
|
||||
return newIter
|
||||
}
|
||||
@@ -1,149 +0,0 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// TestCLIExtractSuperKmers runs the full extraction pipeline on one short
// sequence and checks that every produced super k-mer carries the expected
// metadata attributes with the configured k/m values and parent id.
// NOTE(review): this test mutates the package-level option values via
// SetKmerSize/SetMinimizerSize and does not restore them afterwards.
func TestCLIExtractSuperKmers(t *testing.T) {
	// Create a test sequence
	testSeq := obiseq.NewBioSequence(
		"test_seq",
		[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
		"",
	)

	// Create a batch with the test sequence
	batch := obiseq.NewBioSequenceBatch()
	batch.Add(testSeq)

	// Create an iterator from the batch; pushing and closing happen in a
	// goroutine so the consumer below can drain the channel.
	iterator := obiiter.MakeBioSequenceBatchChannel(1)
	go func() {
		iterator.Push(batch)
		iterator.Close()
	}()

	// Set test parameters (package-level globals read by the pipeline)
	SetKmerSize(15)
	SetMinimizerSize(7)

	// Extract super k-mers
	result := CLIExtractSuperKmers(iterator)

	// Count the number of super k-mers
	count := 0
	for result.Next() {
		batch := result.Get()
		for _, sk := range batch.Slice() {
			count++

			// Verify that the super k-mer has the expected attributes
			if !sk.HasAttribute("minimizer_value") {
				t.Error("Super k-mer missing 'minimizer_value' attribute")
			}
			if !sk.HasAttribute("minimizer_seq") {
				t.Error("Super k-mer missing 'minimizer_seq' attribute")
			}
			if !sk.HasAttribute("k") {
				t.Error("Super k-mer missing 'k' attribute")
			}
			if !sk.HasAttribute("m") {
				t.Error("Super k-mer missing 'm' attribute")
			}
			if !sk.HasAttribute("start") {
				t.Error("Super k-mer missing 'start' attribute")
			}
			if !sk.HasAttribute("end") {
				t.Error("Super k-mer missing 'end' attribute")
			}
			if !sk.HasAttribute("parent_id") {
				t.Error("Super k-mer missing 'parent_id' attribute")
			}

			// Verify attribute values match the configured parameters
			k, _ := sk.GetIntAttribute("k")
			m, _ := sk.GetIntAttribute("m")

			if k != 15 {
				t.Errorf("Expected k=15, got k=%d", k)
			}
			if m != 7 {
				t.Errorf("Expected m=7, got m=%d", m)
			}

			parentID, _ := sk.GetStringAttribute("parent_id")
			if parentID != "test_seq" {
				t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
			}
		}
	}

	// At least one super k-mer must be produced from a 32-base sequence.
	if count == 0 {
		t.Error("No super k-mers were extracted")
	}

	t.Logf("Extracted %d super k-mers from test sequence", count)
}
|
||||
|
||||
func TestOptionGettersAndSetters(t *testing.T) {
|
||||
// Test initial values
|
||||
if CLIKmerSize() != 21 {
|
||||
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
|
||||
}
|
||||
if CLIMinimizerSize() != 11 {
|
||||
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
|
||||
}
|
||||
|
||||
// Test setters
|
||||
SetKmerSize(25)
|
||||
SetMinimizerSize(13)
|
||||
|
||||
if CLIKmerSize() != 25 {
|
||||
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
|
||||
}
|
||||
if CLIMinimizerSize() != 13 {
|
||||
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
|
||||
}
|
||||
|
||||
// Reset to defaults
|
||||
SetKmerSize(21)
|
||||
SetMinimizerSize(11)
|
||||
}
|
||||
|
||||
// BenchmarkCLIExtractSuperKmers measures the full extraction pipeline on a
// 1000-base repetitive sequence with k=21, m=11.
// NOTE(review): batch/iterator construction happens inside the timed loop
// and is therefore included in each measured iteration; the package-level
// k/m globals are set here and not restored afterwards.
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
	// Create a longer test sequence (repeating ACGT pattern)
	longSeq := make([]byte, 1000)
	bases := []byte{'A', 'C', 'G', 'T'}
	for i := range longSeq {
		longSeq[i] = bases[i%4]
	}

	testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")

	// Set parameters (package-level globals read by the pipeline)
	SetKmerSize(21)
	SetMinimizerSize(11)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		batch := obiseq.NewBioSequenceBatch()
		batch.Add(testSeq)

		iterator := obiiter.MakeBioSequenceBatchChannel(1)
		go func() {
			iterator.Push(batch)
			iterator.Close()
		}()

		result := CLIExtractSuperKmers(iterator)

		// Consume the iterator so the whole pipeline runs to completion
		for result.Next() {
			result.Get()
		}
	}
}
|
||||
Reference in New Issue
Block a user