mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-26 14:00:51 +00:00
Refactor k-mer index management with subcommands and enhanced metadata support
This commit refactors the k-mer index management tools into a unified subcommand structure under `obik`. It adds per-set metadata and ID management, lets the k-mer set group builder append to existing groups, and improves command-line option handling with a new global options registration system.

Key changes:
- Introduce the `obik` command with subcommands (index, ls, summary, cp, mv, rm, super, lowmask)
- Add support for per-set metadata and ID management in kmer set groups
- Implement the ability to append to existing kmer index groups
- Refactor option parsing to use a global options registration system
- Add new commands for listing, copying, moving, and removing sets
- Enhance low-complexity masking with new options and output formats
- Improve the kmer index summary with Jaccard distance matrix support
- Remove the deprecated obikindex and obisuperkmer commands
- Update the build process to use the new subcommand structure
This commit is contained in:
55
pkg/obitools/obik/cp.go
Normal file
55
pkg/obitools/obik/cp.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runCp(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("usage: obik cp [--set PATTERN]... [--force] <source_index> <dest_index>")
|
||||
}
|
||||
|
||||
srcDir := args[0]
|
||||
destDir := args[1]
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open source kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Resolve set patterns
|
||||
patterns := CLISetPatterns()
|
||||
var ids []string
|
||||
if len(patterns) > 0 {
|
||||
indices, err := ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
ids = make([]string, len(indices))
|
||||
for i, idx := range indices {
|
||||
ids[i] = ksg.SetIDOf(idx)
|
||||
}
|
||||
} else {
|
||||
// Copy all sets
|
||||
ids = ksg.SetsIDs()
|
||||
}
|
||||
|
||||
log.Infof("Copying %d set(s) from %s to %s", len(ids), srcDir, destDir)
|
||||
|
||||
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Infof("Destination now has %d set(s)", dest.Size())
|
||||
return nil
|
||||
}
|
||||
110
pkg/obitools/obik/index.go
Normal file
110
pkg/obitools/obik/index.go
Normal file
@@ -0,0 +1,110 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
outDir := CLIOutputDirectory()
|
||||
if outDir == "" || outDir == "-" {
|
||||
return fmt.Errorf("--out option is required and must specify a directory path")
|
||||
}
|
||||
|
||||
k := CLIKmerSize()
|
||||
if k < 2 || k > 31 {
|
||||
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
m := CLIMinimizerSize()
|
||||
|
||||
minOcc := CLIMinOccurrence()
|
||||
if minOcc < 1 {
|
||||
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
|
||||
}
|
||||
|
||||
// Build options
|
||||
var opts []obikmer.BuilderOption
|
||||
if minOcc > 1 {
|
||||
opts = append(opts, obikmer.WithMinFrequency(minOcc))
|
||||
}
|
||||
|
||||
// Determine whether to append to existing group or create new
|
||||
var builder *obikmer.KmerSetGroupBuilder
|
||||
var err error
|
||||
metaPath := filepath.Join(outDir, "metadata.toml")
|
||||
if _, statErr := os.Stat(metaPath); statErr == nil {
|
||||
// Existing group: append
|
||||
log.Infof("Appending to existing kmer index at %s", outDir)
|
||||
builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
|
||||
}
|
||||
} else {
|
||||
// New group
|
||||
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create kmer index builder: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Read and process sequences
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
seqCount := 0
|
||||
for sequences.Next() {
|
||||
batch := sequences.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
builder.AddSequence(0, seq)
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount)
|
||||
|
||||
// Finalize
|
||||
ksg, err := builder.Close()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to finalize kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Apply index-id to the new set
|
||||
newSetIdx := builder.StartIndex()
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ksg.SetSetID(newSetIdx, id)
|
||||
}
|
||||
|
||||
// Apply group-level tags (-S)
|
||||
for key, value := range CLISetTag() {
|
||||
ksg.SetAttribute(key, value)
|
||||
}
|
||||
|
||||
// Apply per-set tags (-T) to the new set
|
||||
for key, value := range _setMetaTags {
|
||||
ksg.SetSetMetadata(newSetIdx, key, value)
|
||||
}
|
||||
|
||||
if minOcc > 1 {
|
||||
ksg.SetAttribute("min_occurrence", minOcc)
|
||||
}
|
||||
|
||||
if err := ksg.SaveMetadata(); err != nil {
|
||||
return fmt.Errorf("failed to save metadata: %w", err)
|
||||
}
|
||||
|
||||
log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
|
||||
log.Info("Done.")
|
||||
return nil
|
||||
}
|
||||
457
pkg/obitools/obik/lowmask.go
Normal file
457
pkg/obitools/obik/lowmask.go
Normal file
@@ -0,0 +1,457 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// MaskingMode selects what the lowmask worker does with the low-complexity
// regions it detects.
type MaskingMode int

const (
	// MaskMode replaces low-complexity regions with the masking character.
	MaskMode MaskingMode = iota
	// SplitMode splits the sequence into its high-complexity fragments.
	SplitMode
	// ExtractMode extracts the low-complexity fragments.
	ExtractMode
)
|
||||
|
||||
// Option variables owned by the lowmask subcommand. They are kept apart from
// the kmer-size variable used by the index/super subcommands.
var (
	_lowmaskKmerSize  = 31
	_lowmaskLevelMax  = 6
	_lowmaskThreshold = 0.5
	_lowmaskSplitMode = false
	_lowmaskLowMode   = false
	_lowmaskMaskChar  = "."
)
|
||||
|
||||
// LowMaskOptionSet registers options specific to low-complexity masking.
|
||||
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize,
|
||||
options.Description("Size of the kmer considered to estimate entropy."))
|
||||
|
||||
options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax,
|
||||
options.Description("Maximum word size considered for entropy estimate."))
|
||||
|
||||
options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold,
|
||||
options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))
|
||||
|
||||
options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode,
|
||||
options.Description("Split sequences to remove masked regions."))
|
||||
|
||||
options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode,
|
||||
options.Description("Extract only low-complexity regions."))
|
||||
|
||||
options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
|
||||
options.Description("Character used to mask low complexity regions."))
|
||||
}
|
||||
|
||||
func lowmaskMaskingMode() MaskingMode {
|
||||
switch {
|
||||
case _lowmaskLowMode:
|
||||
return ExtractMode
|
||||
case _lowmaskSplitMode:
|
||||
return SplitMode
|
||||
default:
|
||||
return MaskMode
|
||||
}
|
||||
}
|
||||
|
||||
func lowmaskMaskingChar() byte {
|
||||
mask := strings.TrimSpace(_lowmaskMaskChar)
|
||||
if len(mask) != 1 {
|
||||
log.Fatalf("--masking-char option accepts a single character, not %s", mask)
|
||||
}
|
||||
return []byte(mask)[0]
|
||||
}
|
||||
|
||||
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
||||
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
||||
|
||||
nLogN := make([]float64, kmer_size+1)
|
||||
for i := 1; i <= kmer_size; i++ {
|
||||
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||
}
|
||||
|
||||
normTables := make([][]int, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
size := 1 << (ws * 2)
|
||||
normTables[ws] = make([]int, size)
|
||||
for code := 0; code < size; code++ {
|
||||
normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
|
||||
}
|
||||
}
|
||||
|
||||
type pair struct {
|
||||
index int
|
||||
value float64
|
||||
}
|
||||
|
||||
slidingMin := func(data []float64, window int) {
|
||||
if len(data) == 0 || window <= 0 {
|
||||
return
|
||||
}
|
||||
if window >= len(data) {
|
||||
minVal := data[0]
|
||||
for i := 1; i < len(data); i++ {
|
||||
if data[i] < minVal {
|
||||
minVal = data[i]
|
||||
}
|
||||
}
|
||||
for i := range data {
|
||||
data[i] = minVal
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
deque := make([]pair, 0, window)
|
||||
|
||||
for i, v := range data {
|
||||
for len(deque) > 0 && deque[0].index <= i-window {
|
||||
deque = deque[1:]
|
||||
}
|
||||
|
||||
for len(deque) > 0 && deque[len(deque)-1].value >= v {
|
||||
deque = deque[:len(deque)-1]
|
||||
}
|
||||
|
||||
deque = append(deque, pair{index: i, value: v})
|
||||
|
||||
data[i] = deque[0].value
|
||||
}
|
||||
}
|
||||
|
||||
emaxValues := make([]float64, level_max+1)
|
||||
logNwords := make([]float64, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
nw := kmer_size - ws + 1
|
||||
na := obikmer.CanonicalCircularKmerCount(ws)
|
||||
if nw < na {
|
||||
logNwords[ws] = math.Log(float64(nw))
|
||||
emaxValues[ws] = math.Log(float64(nw))
|
||||
} else {
|
||||
cov := nw / na
|
||||
remains := nw - (na * cov)
|
||||
f1 := float64(cov) / float64(nw)
|
||||
f2 := float64(cov+1) / float64(nw)
|
||||
logNwords[ws] = math.Log(float64(nw))
|
||||
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||
float64(remains)*f2*math.Log(f2))
|
||||
}
|
||||
}
|
||||
|
||||
maskAmbiguities := func(sequence []byte) []int {
|
||||
maskPositions := make([]int, len(sequence))
|
||||
for i, nuc := range sequence {
|
||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
end := max(0, i-kmer_size+1)
|
||||
for j := i; j >= end; j-- {
|
||||
maskPositions[j] = -1
|
||||
}
|
||||
}
|
||||
}
|
||||
return maskPositions
|
||||
}
|
||||
|
||||
cleanTable := func(table []int, over int) {
|
||||
for i := 0; i < over; i++ {
|
||||
table[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
computeEntropies := func(sequence []byte,
|
||||
maskPositions []int,
|
||||
entropies []float64,
|
||||
table []int,
|
||||
words []int,
|
||||
wordSize int,
|
||||
normTable []int) {
|
||||
|
||||
lseq := len(sequence)
|
||||
tableSize := 1 << (wordSize * 2)
|
||||
nwords := kmer_size - wordSize + 1
|
||||
float_nwords := float64(nwords)
|
||||
log_nwords := logNwords[wordSize]
|
||||
entropyMax := emaxValues[wordSize]
|
||||
|
||||
cleanTable(table, tableSize)
|
||||
|
||||
for i := 1; i < lseq; i++ {
|
||||
entropies[i] = 6
|
||||
}
|
||||
end := lseq - wordSize + 1
|
||||
|
||||
mask := (1 << (wordSize * 2)) - 1
|
||||
|
||||
word_index := 0
|
||||
for i := 0; i < wordSize-1; i++ {
|
||||
word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
|
||||
}
|
||||
|
||||
for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
|
||||
word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
|
||||
words[i] = normTable[word_index]
|
||||
}
|
||||
|
||||
s := 0
|
||||
sum_n_logn := 0.0
|
||||
entropy := 1.0
|
||||
cleaned := true
|
||||
|
||||
for i := range end {
|
||||
s++
|
||||
|
||||
switch {
|
||||
case s < nwords:
|
||||
cleaned = false
|
||||
table[words[i]]++
|
||||
|
||||
case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
|
||||
entropies[i-nwords+1] = 4.0
|
||||
if !cleaned {
|
||||
cleanTable(table, tableSize)
|
||||
}
|
||||
cleaned = true
|
||||
s = 0
|
||||
sum_n_logn = 0.0
|
||||
|
||||
case s == nwords:
|
||||
cleaned = false
|
||||
table[words[i]]++
|
||||
|
||||
sum_n_logn = 0
|
||||
for j := range tableSize {
|
||||
n := float64(table[j])
|
||||
if n > 0 {
|
||||
sum_n_logn += nLogN[int(n)]
|
||||
}
|
||||
}
|
||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||
|
||||
case s > nwords:
|
||||
cleaned = false
|
||||
|
||||
new_word := words[i]
|
||||
old_word := words[i-nwords]
|
||||
|
||||
if old_word != new_word {
|
||||
table[new_word]++
|
||||
table[old_word]--
|
||||
|
||||
n_old := float64(table[old_word])
|
||||
n_new := float64(table[new_word])
|
||||
|
||||
sum_n_logn -= nLogN[int(n_old+1)]
|
||||
if n_old > 0 {
|
||||
sum_n_logn += nLogN[int(n_old)]
|
||||
}
|
||||
if n_new > 0 {
|
||||
sum_n_logn += nLogN[int(n_new)]
|
||||
}
|
||||
if n_new > 1 {
|
||||
sum_n_logn -= nLogN[int(n_new-1)]
|
||||
}
|
||||
}
|
||||
|
||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||
}
|
||||
|
||||
if s >= nwords && maskPositions[i-nwords+1] >= 0 {
|
||||
if entropy < 0 {
|
||||
entropy = 0
|
||||
}
|
||||
entropy = math.Round(entropy*10000) / 10000
|
||||
entropies[i-nwords+1] = entropy
|
||||
}
|
||||
}
|
||||
|
||||
slidingMin(entropies, kmer_size)
|
||||
}
|
||||
|
||||
applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
|
||||
seqCopy := sequence.Copy()
|
||||
sequenceBytes := seqCopy.Sequence()
|
||||
|
||||
for i := range sequenceBytes {
|
||||
if maskPositions[i] {
|
||||
sequenceBytes[i] = mask
|
||||
}
|
||||
}
|
||||
|
||||
return obiseq.BioSequenceSlice{seqCopy}, nil
|
||||
}
|
||||
|
||||
selectMasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
|
||||
rep := obiseq.NewBioSequenceSlice()
|
||||
|
||||
inlow := false
|
||||
fromlow := -1
|
||||
for i, masked := range maskPosition {
|
||||
if masked && !inlow {
|
||||
fromlow = i
|
||||
inlow = true
|
||||
}
|
||||
if inlow && !masked {
|
||||
if fromlow >= 0 {
|
||||
frg, err := sequence.Subsequence(fromlow, i, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rep.Push(frg)
|
||||
}
|
||||
inlow = false
|
||||
fromlow = -1
|
||||
}
|
||||
}
|
||||
|
||||
if inlow && fromlow >= 0 {
|
||||
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rep.Push(frg)
|
||||
}
|
||||
|
||||
return *rep, nil
|
||||
}
|
||||
|
||||
selectunmasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
|
||||
rep := obiseq.NewBioSequenceSlice()
|
||||
|
||||
inhigh := false
|
||||
fromhigh := -1
|
||||
for i, masked := range maskPosition {
|
||||
if !masked && !inhigh {
|
||||
fromhigh = i
|
||||
inhigh = true
|
||||
}
|
||||
if inhigh && masked {
|
||||
if fromhigh >= 0 {
|
||||
frg, err := sequence.Subsequence(fromhigh, i, false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rep.Push(frg)
|
||||
}
|
||||
inhigh = false
|
||||
fromhigh = -1
|
||||
}
|
||||
}
|
||||
|
||||
if inhigh && fromhigh >= 0 {
|
||||
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rep.Push(frg)
|
||||
}
|
||||
|
||||
return *rep, nil
|
||||
}
|
||||
|
||||
masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
if sequence.Len() < kmer_size {
|
||||
sequence.SetAttribute("obilowmask_error", "Sequence too short")
|
||||
remove := make([]bool, sequence.Len())
|
||||
for i := range remove {
|
||||
remove[i] = true
|
||||
}
|
||||
return applyMaskMode(sequence, remove, maskChar)
|
||||
}
|
||||
|
||||
bseq := sequence.Sequence()
|
||||
|
||||
maskPositions := maskAmbiguities(bseq)
|
||||
|
||||
maskFlags := make([]int, len(bseq))
|
||||
entropies := make([]float64, len(bseq))
|
||||
for i := range entropies {
|
||||
entropies[i] = 4.0
|
||||
}
|
||||
|
||||
freqs := make([]int, 1<<(2*level_max))
|
||||
words := make([]int, len(bseq))
|
||||
entropies2 := make([]float64, len(bseq))
|
||||
|
||||
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
|
||||
|
||||
for i := range bseq {
|
||||
v := level_max
|
||||
maskFlags[i] = v
|
||||
}
|
||||
|
||||
for ws := level_max - 1; ws > 0; ws-- {
|
||||
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
|
||||
for i, e2 := range entropies2 {
|
||||
if e2 < entropies[i] {
|
||||
entropies[i] = e2
|
||||
maskFlags[i] = ws
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i, nuc := range bseq {
|
||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
entropies[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
remove := make([]bool, len(entropies))
|
||||
for i, e := range entropies {
|
||||
remove[i] = e <= threshold
|
||||
}
|
||||
|
||||
sequence.SetAttribute("mask", maskFlags)
|
||||
sequence.SetAttribute("Entropies", entropies)
|
||||
|
||||
switch mode {
|
||||
case MaskMode:
|
||||
return applyMaskMode(sequence, remove, maskChar)
|
||||
case SplitMode:
|
||||
return selectunmasked(sequence, remove)
|
||||
case ExtractMode:
|
||||
return selectMasked(sequence, remove)
|
||||
}
|
||||
return nil, fmt.Errorf("unknown mode %d", mode)
|
||||
}
|
||||
|
||||
return masking
|
||||
}
|
||||
|
||||
// runLowmask implements the "obik lowmask" subcommand.
|
||||
// It masks low-complexity regions in DNA sequences using entropy-based detection.
|
||||
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
kmerSize := _lowmaskKmerSize
|
||||
levelMax := _lowmaskLevelMax
|
||||
threshold := _lowmaskThreshold
|
||||
mode := lowmaskMaskingMode()
|
||||
maskChar := lowmaskMaskingChar()
|
||||
|
||||
log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold)
|
||||
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar)
|
||||
|
||||
masked := sequences.MakeIWorker(
|
||||
worker,
|
||||
false,
|
||||
obidefault.ParallelWorkers(),
|
||||
).FilterEmpty()
|
||||
|
||||
obiconvert.CLIWriteBioSequences(masked, true)
|
||||
obiutils.WaitForLastPipe()
|
||||
|
||||
return nil
|
||||
}
|
||||
96
pkg/obitools/obik/ls.go
Normal file
96
pkg/obitools/obik/ls.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// setEntry is one row of the "obik ls" listing. The struct tags give the
// field names used by the JSON and YAML outputs.
type setEntry struct {
	// Index is the position of the set inside the index.
	Index int `json:"index" yaml:"index"`
	// ID is the identifier of the set.
	ID string `json:"id" yaml:"id"`
	// Count is the number of k-mers reported for the set.
	Count uint64 `json:"count" yaml:"count"`
}
|
||||
|
||||
func runLs(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik ls [options] <index_directory>")
|
||||
}
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Determine which sets to show
|
||||
patterns := CLISetPatterns()
|
||||
var indices []int
|
||||
if len(patterns) > 0 {
|
||||
indices, err = ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
indices = make([]int, ksg.Size())
|
||||
for i := range indices {
|
||||
indices[i] = i
|
||||
}
|
||||
}
|
||||
|
||||
entries := make([]setEntry, len(indices))
|
||||
for i, idx := range indices {
|
||||
entries[i] = setEntry{
|
||||
Index: idx,
|
||||
ID: ksg.SetIDOf(idx),
|
||||
Count: ksg.Len(idx),
|
||||
}
|
||||
}
|
||||
|
||||
format := CLIOutFormat()
|
||||
switch format {
|
||||
case "json":
|
||||
return outputLsJSON(entries)
|
||||
case "yaml":
|
||||
return outputLsYAML(entries)
|
||||
case "csv":
|
||||
return outputLsCSV(entries)
|
||||
default:
|
||||
return outputLsCSV(entries)
|
||||
}
|
||||
}
|
||||
|
||||
func outputLsCSV(entries []setEntry) error {
|
||||
fmt.Println("index,id,count")
|
||||
for _, e := range entries {
|
||||
// Escape commas in ID if needed
|
||||
id := e.ID
|
||||
if strings.ContainsAny(id, ",\"") {
|
||||
id = "\"" + strings.ReplaceAll(id, "\"", "\"\"") + "\""
|
||||
}
|
||||
fmt.Printf("%d,%s,%d\n", e.Index, id, e.Count)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputLsJSON(entries []setEntry) error {
|
||||
data, err := json.MarshalIndent(entries, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputLsYAML(entries []setEntry) error {
|
||||
data, err := yaml.Marshal(entries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(string(data))
|
||||
return nil
|
||||
}
|
||||
63
pkg/obitools/obik/mv.go
Normal file
63
pkg/obitools/obik/mv.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runMv(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 2 {
|
||||
return fmt.Errorf("usage: obik mv [--set PATTERN]... [--force] <source_index> <dest_index>")
|
||||
}
|
||||
|
||||
srcDir := args[0]
|
||||
destDir := args[1]
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(srcDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open source kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Resolve set patterns
|
||||
patterns := CLISetPatterns()
|
||||
var ids []string
|
||||
if len(patterns) > 0 {
|
||||
indices, err := ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
ids = make([]string, len(indices))
|
||||
for i, idx := range indices {
|
||||
ids[i] = ksg.SetIDOf(idx)
|
||||
}
|
||||
} else {
|
||||
// Move all sets
|
||||
ids = ksg.SetsIDs()
|
||||
}
|
||||
|
||||
log.Infof("Moving %d set(s) from %s to %s", len(ids), srcDir, destDir)
|
||||
|
||||
// Copy first
|
||||
dest, err := ksg.CopySetsByIDTo(ids, destDir, CLIForce())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Remove from source (in reverse order to avoid renumbering issues)
|
||||
for i := len(ids) - 1; i >= 0; i-- {
|
||||
if err := ksg.RemoveSetByID(ids[i]); err != nil {
|
||||
return fmt.Errorf("failed to remove set %q from source after copy: %w", ids[i], err)
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Destination now has %d set(s), source has %d set(s)", dest.Size(), ksg.Size())
|
||||
return nil
|
||||
}
|
||||
64
pkg/obitools/obik/obik.go
Normal file
64
pkg/obitools/obik/obik.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// OptionSet registers all obik subcommands on the root GetOpt.
|
||||
func OptionSet(opt *getoptions.GetOpt) {
|
||||
// index: build or extend a kmer index from sequence files
|
||||
indexCmd := opt.NewCommand("index", "Build a disk-based kmer index from sequence files")
|
||||
obiconvert.InputOptionSet(indexCmd)
|
||||
obiconvert.OutputModeOptionSet(indexCmd, false)
|
||||
KmerIndexOptionSet(indexCmd)
|
||||
indexCmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
|
||||
indexCmd.Alias("T"),
|
||||
indexCmd.ArgName("KEY=VALUE"),
|
||||
indexCmd.Description("Per-set metadata tag (repeatable)."))
|
||||
indexCmd.SetCommandFn(runIndex)
|
||||
|
||||
// ls: list sets in a kmer index
|
||||
lsCmd := opt.NewCommand("ls", "List sets in a kmer index")
|
||||
OutputFormatOptionSet(lsCmd)
|
||||
SetSelectionOptionSet(lsCmd)
|
||||
lsCmd.SetCommandFn(runLs)
|
||||
|
||||
// summary: detailed statistics
|
||||
summaryCmd := opt.NewCommand("summary", "Show detailed statistics of a kmer index")
|
||||
OutputFormatOptionSet(summaryCmd)
|
||||
summaryCmd.BoolVar(&_jaccard, "jaccard", false,
|
||||
summaryCmd.Description("Compute and display pairwise Jaccard distance matrix."))
|
||||
summaryCmd.SetCommandFn(runSummary)
|
||||
|
||||
// cp: copy sets between indices
|
||||
cpCmd := opt.NewCommand("cp", "Copy sets between kmer indices")
|
||||
SetSelectionOptionSet(cpCmd)
|
||||
ForceOptionSet(cpCmd)
|
||||
cpCmd.SetCommandFn(runCp)
|
||||
|
||||
// mv: move sets between indices
|
||||
mvCmd := opt.NewCommand("mv", "Move sets between kmer indices")
|
||||
SetSelectionOptionSet(mvCmd)
|
||||
ForceOptionSet(mvCmd)
|
||||
mvCmd.SetCommandFn(runMv)
|
||||
|
||||
// rm: remove sets from an index
|
||||
rmCmd := opt.NewCommand("rm", "Remove sets from a kmer index")
|
||||
SetSelectionOptionSet(rmCmd)
|
||||
rmCmd.SetCommandFn(runRm)
|
||||
|
||||
// super: extract super k-mers from sequences
|
||||
superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
|
||||
obiconvert.InputOptionSet(superCmd)
|
||||
obiconvert.OutputOptionSet(superCmd)
|
||||
SuperKmerOptionSet(superCmd)
|
||||
superCmd.SetCommandFn(runSuper)
|
||||
|
||||
// lowmask: mask low-complexity regions
|
||||
lowmaskCmd := opt.NewCommand("lowmask", "Mask low-complexity regions in sequences using entropy")
|
||||
obiconvert.InputOptionSet(lowmaskCmd)
|
||||
obiconvert.OutputOptionSet(lowmaskCmd)
|
||||
LowMaskOptionSet(lowmaskCmd)
|
||||
lowmaskCmd.SetCommandFn(runLowmask)
|
||||
}
|
||||
189
pkg/obitools/obik/options.go
Normal file
189
pkg/obitools/obik/options.go
Normal file
@@ -0,0 +1,189 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Flags shared by several subcommands.
var (
	// Output format selection (--json-output, --csv-output, --yaml-output).
	_jsonOutput bool
	_csvOutput  bool
	_yamlOutput bool

	// Patterns given with --set.
	_setPatterns []string

	// Set by --force.
	_force bool

	// Set by --jaccard (summary subcommand).
	_jaccard bool

	// Per-set metadata tags given with --tag (index subcommand).
	_setMetaTags = make(map[string]string)
)
|
||||
|
||||
// ==============================
// Options of the kmer index builder (formerly in obikindex)
// ==============================
var (
	_kmerSize       = 31
	_minimizerSize  = -1 // negative: let CLIMinimizerSize pick the default, ceil(k / 2.5)
	_indexId        = ""
	_metadataFormat = "toml"
	_setTag         = make(map[string]string)
	_minOccurrence  = 1
	_saveFullFilter = false
)
|
||||
|
||||
// KmerIndexOptionSet defines every option related to kmer index building.
|
||||
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between 2 and 31)."))
|
||||
|
||||
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
|
||||
|
||||
options.StringVar(&_indexId, "index-id", _indexId,
|
||||
options.Description("Identifier for the kmer index."))
|
||||
|
||||
options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat,
|
||||
options.Description("Format for metadata file (toml, yaml, json)."))
|
||||
|
||||
options.StringMapVar(&_setTag, "set-tag", 1, 1,
|
||||
options.Alias("S"),
|
||||
options.ArgName("KEY=VALUE"),
|
||||
options.Description("Adds a group-level metadata attribute KEY with value VALUE."))
|
||||
|
||||
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
|
||||
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
|
||||
|
||||
options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
|
||||
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size.
|
||||
func CLIKmerSize() int {
|
||||
return _kmerSize
|
||||
}
|
||||
|
||||
// CLIMinimizerSize returns the effective minimizer size.
|
||||
func CLIMinimizerSize() int {
|
||||
m := _minimizerSize
|
||||
if m < 0 {
|
||||
m = obikmer.DefaultMinimizerSize(_kmerSize)
|
||||
}
|
||||
nworkers := obidefault.ParallelWorkers()
|
||||
m = obikmer.ValidateMinimizerSize(m, _kmerSize, nworkers)
|
||||
return m
|
||||
}
|
||||
|
||||
// CLIIndexId returns the index identifier.
|
||||
func CLIIndexId() string {
|
||||
return _indexId
|
||||
}
|
||||
|
||||
// CLIMetadataFormat returns the metadata format.
|
||||
func CLIMetadataFormat() obikmer.MetadataFormat {
|
||||
switch strings.ToLower(_metadataFormat) {
|
||||
case "toml":
|
||||
return obikmer.FormatTOML
|
||||
case "yaml":
|
||||
return obikmer.FormatYAML
|
||||
case "json":
|
||||
return obikmer.FormatJSON
|
||||
default:
|
||||
log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
|
||||
return obikmer.FormatTOML
|
||||
}
|
||||
}
|
||||
|
||||
// CLISetTag returns the group-level metadata key=value pairs.
|
||||
func CLISetTag() map[string]string {
|
||||
return _setTag
|
||||
}
|
||||
|
||||
// CLIMinOccurrence returns the minimum occurrence threshold.
|
||||
func CLIMinOccurrence() int {
|
||||
return _minOccurrence
|
||||
}
|
||||
|
||||
// CLISaveFullFilter returns whether to save the full frequency filter.
|
||||
func CLISaveFullFilter() bool {
|
||||
return _saveFullFilter
|
||||
}
|
||||
|
||||
// CLIOutputDirectory returns the output directory path.
|
||||
func CLIOutputDirectory() string {
|
||||
return obiconvert.CLIOutPutFileName()
|
||||
}
|
||||
|
||||
// SetKmerSize sets the k-mer size (for testing).
|
||||
func SetKmerSize(k int) {
|
||||
_kmerSize = k
|
||||
}
|
||||
|
||||
// SetMinimizerSize sets the minimizer size (for testing).
|
||||
func SetMinimizerSize(m int) {
|
||||
_minimizerSize = m
|
||||
}
|
||||
|
||||
// SetMinOccurrence sets the minimum occurrence (for testing).
|
||||
func SetMinOccurrence(n int) {
|
||||
_minOccurrence = n
|
||||
}
|
||||
|
||||
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
||||
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_jsonOutput, "json-output", false,
|
||||
options.Description("Print results as JSON."))
|
||||
options.BoolVar(&_csvOutput, "csv-output", false,
|
||||
options.Description("Print results as CSV."))
|
||||
options.BoolVar(&_yamlOutput, "yaml-output", false,
|
||||
options.Description("Print results as YAML."))
|
||||
}
|
||||
|
||||
// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text".
|
||||
func CLIOutFormat() string {
|
||||
if _jsonOutput {
|
||||
return "json"
|
||||
}
|
||||
if _csvOutput {
|
||||
return "csv"
|
||||
}
|
||||
if _yamlOutput {
|
||||
return "yaml"
|
||||
}
|
||||
return "text"
|
||||
}
|
||||
|
||||
// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
|
||||
func SetSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringSliceVar(&_setPatterns, "set", 1, 1,
|
||||
options.Alias("s"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
|
||||
}
|
||||
|
||||
// CLISetPatterns returns the --set patterns provided by the user.
|
||||
func CLISetPatterns() []string {
|
||||
return _setPatterns
|
||||
}
|
||||
|
||||
// ForceOptionSet registers --force / -f.
|
||||
func ForceOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_force, "force", false,
|
||||
options.Alias("f"),
|
||||
options.Description("Force operation even if set ID already exists in destination."))
|
||||
}
|
||||
|
||||
// CLIForce returns whether --force was specified.
|
||||
func CLIForce() bool {
|
||||
return _force
|
||||
}
|
||||
56
pkg/obitools/obik/rm.go
Normal file
56
pkg/obitools/obik/rm.go
Normal file
@@ -0,0 +1,56 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
func runRm(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik rm --set PATTERN [--set PATTERN]... <index_directory>")
|
||||
}
|
||||
|
||||
patterns := CLISetPatterns()
|
||||
if len(patterns) == 0 {
|
||||
return fmt.Errorf("--set is required (specify which sets to remove)")
|
||||
}
|
||||
|
||||
indexDir := args[0]
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(indexDir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
indices, err := ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
|
||||
// Collect IDs before removal (indices shift as we remove)
|
||||
ids := make([]string, len(indices))
|
||||
for i, idx := range indices {
|
||||
ids[i] = ksg.SetIDOf(idx)
|
||||
}
|
||||
|
||||
log.Infof("Removing %d set(s) from %s", len(ids), indexDir)
|
||||
|
||||
// Remove in reverse order to avoid renumbering issues
|
||||
for i := len(ids) - 1; i >= 0; i-- {
|
||||
if err := ksg.RemoveSetByID(ids[i]); err != nil {
|
||||
return fmt.Errorf("failed to remove set %q: %w", ids[i], err)
|
||||
}
|
||||
log.Infof("Removed set %q", ids[i])
|
||||
}
|
||||
|
||||
log.Infof("Index now has %d set(s)", ksg.Size())
|
||||
return nil
|
||||
}
|
||||
148
pkg/obitools/obik/summary.go
Normal file
148
pkg/obitools/obik/summary.go
Normal file
@@ -0,0 +1,148 @@
|
||||
package obik
|
||||
|
||||
import (
	"context"
	"encoding/csv"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strconv"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
	"github.com/DavidGamba/go-getoptions"
	"gopkg.in/yaml.v3"
)
|
||||
|
||||
// setSummary is the per-set record emitted by "obik summary". The struct tags
// define the field names used in the JSON and YAML renderings.
type setSummary struct {
	// Index is the position of the set inside the group.
	Index int `json:"index" yaml:"index"`
	// ID is the identifier of the set.
	ID string `json:"id" yaml:"id"`
	// Count is the value reported by the group's Len for this set.
	Count uint64 `json:"count" yaml:"count"`
	// DiskSize is the number of bytes computed by computeSetDiskSize.
	DiskSize int64 `json:"disk_bytes" yaml:"disk_bytes"`
	// Metadata holds the set's metadata; it is omitted from the output when empty.
	Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
}
|
||||
|
||||
type groupSummary struct {
|
||||
Path string `json:"path" yaml:"path"`
|
||||
ID string `json:"id,omitempty" yaml:"id,omitempty"`
|
||||
K int `json:"k" yaml:"k"`
|
||||
M int `json:"m" yaml:"m"`
|
||||
Partitions int `json:"partitions" yaml:"partitions"`
|
||||
TotalSets int `json:"total_sets" yaml:"total_sets"`
|
||||
TotalKmers uint64 `json:"total_kmers" yaml:"total_kmers"`
|
||||
TotalDisk int64 `json:"total_disk_bytes" yaml:"total_disk_bytes"`
|
||||
Metadata map[string]interface{} `json:"metadata,omitempty" yaml:"metadata,omitempty"`
|
||||
Sets []setSummary `json:"sets" yaml:"sets"`
|
||||
Jaccard [][]float64 `json:"jaccard,omitempty" yaml:"jaccard,omitempty"`
|
||||
}
|
||||
|
||||
func runSummary(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik summary [options] <index_directory>")
|
||||
}
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
summary := groupSummary{
|
||||
Path: ksg.Path(),
|
||||
ID: ksg.Id(),
|
||||
K: ksg.K(),
|
||||
M: ksg.M(),
|
||||
Partitions: ksg.Partitions(),
|
||||
TotalSets: ksg.Size(),
|
||||
TotalKmers: ksg.Len(),
|
||||
Metadata: ksg.Metadata,
|
||||
Sets: make([]setSummary, ksg.Size()),
|
||||
}
|
||||
|
||||
var totalDisk int64
|
||||
for i := 0; i < ksg.Size(); i++ {
|
||||
diskSize := computeSetDiskSize(ksg, i)
|
||||
totalDisk += diskSize
|
||||
summary.Sets[i] = setSummary{
|
||||
Index: i,
|
||||
ID: ksg.SetIDOf(i),
|
||||
Count: ksg.Len(i),
|
||||
DiskSize: diskSize,
|
||||
Metadata: ksg.AllSetMetadata(i),
|
||||
}
|
||||
}
|
||||
summary.TotalDisk = totalDisk
|
||||
|
||||
// Jaccard matrix
|
||||
if _jaccard && ksg.Size() > 1 {
|
||||
dm := ksg.JaccardDistanceMatrix()
|
||||
n := ksg.Size()
|
||||
matrix := make([][]float64, n)
|
||||
for i := 0; i < n; i++ {
|
||||
matrix[i] = make([]float64, n)
|
||||
for j := 0; j < n; j++ {
|
||||
if i == j {
|
||||
matrix[i][j] = 0
|
||||
} else {
|
||||
matrix[i][j] = dm.Get(i, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
summary.Jaccard = matrix
|
||||
}
|
||||
|
||||
format := CLIOutFormat()
|
||||
switch format {
|
||||
case "json":
|
||||
return outputSummaryJSON(summary)
|
||||
case "yaml":
|
||||
return outputSummaryYAML(summary)
|
||||
case "csv":
|
||||
return outputSummaryCSV(summary)
|
||||
default:
|
||||
return outputSummaryJSON(summary)
|
||||
}
|
||||
}
|
||||
|
||||
func computeSetDiskSize(ksg *obikmer.KmerSetGroup, setIndex int) int64 {
|
||||
var total int64
|
||||
for p := 0; p < ksg.Partitions(); p++ {
|
||||
path := ksg.PartitionPath(setIndex, p)
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
total += info.Size()
|
||||
}
|
||||
// Also count the set directory entry itself
|
||||
setDir := filepath.Join(ksg.Path(), fmt.Sprintf("set_%d", setIndex))
|
||||
entries, err := os.ReadDir(setDir)
|
||||
if err == nil {
|
||||
// We already counted .kdi files above; this is just for completeness
|
||||
_ = entries
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
||||
func outputSummaryJSON(summary groupSummary) error {
|
||||
data, err := json.MarshalIndent(summary, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Println(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputSummaryYAML(summary groupSummary) error {
|
||||
data, err := yaml.Marshal(summary)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(string(data))
|
||||
return nil
|
||||
}
|
||||
|
||||
func outputSummaryCSV(summary groupSummary) error {
|
||||
fmt.Println("index,id,count,disk_bytes")
|
||||
for _, s := range summary.Sets {
|
||||
fmt.Printf("%d,%s,%d,%d\n", s.Index, s.ID, s.Count, s.DiskSize)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
64
pkg/obitools/obik/super.go
Normal file
64
pkg/obitools/obik/super.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Super k-mer specific option variables.
|
||||
// These reuse _kmerSize and _minimizerSize from options.go since
|
||||
// only one subcommand runs at a time.
|
||||
|
||||
// SuperKmerOptionSet registers options specific to super k-mer extraction.
|
||||
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
||||
|
||||
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
||||
}
|
||||
|
||||
// runSuper implements the "obik super" subcommand.
|
||||
// It extracts super k-mers from DNA sequences.
|
||||
func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
k := _kmerSize
|
||||
m := _minimizerSize
|
||||
|
||||
if k < 2 || k > 31 {
|
||||
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
if m < 1 || m >= k {
|
||||
return fmt.Errorf("invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||
}
|
||||
|
||||
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
worker := obikmer.SuperKmerWorker(k, m)
|
||||
|
||||
superkmers := sequences.MakeIWorker(
|
||||
worker,
|
||||
false,
|
||||
obidefault.ParallelWorkers(),
|
||||
)
|
||||
|
||||
obiconvert.CLIWriteBioSequences(superkmers, true)
|
||||
obiutils.WaitForLastPipe()
|
||||
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user