Add obisuperkmer command implementation and tests

This commit adds the implementation of the obisuperkmer command, including:

- The main command in cmd/obitools/obisuperkmer/
- The package implementation in pkg/obitools/obisuperkmer/
- Automated tests in obitests/obitools/obisuperkmer/
- Documentation for the implementation and tests

The obisuperkmer command extracts super k-mers from DNA sequences, following the standard OBITools architecture. It includes proper CLI option handling, validation of parameters, and integration with the OBITools pipeline system.

Tests cover basic functionality, parameter validation, output format, metadata preservation, and file I/O operations.
This commit is contained in:
Eric Coissac
2026-02-07 13:53:13 +01:00
parent 00c8be6b48
commit 7a979ba77f
11 changed files with 1755 additions and 0 deletions

View File

@@ -0,0 +1,10 @@
// obisuperkmer function utility package.
//
// The obitools/obisuperkmer package contains every
// function specifically required by the obisuperkmer utility.
//
// The obisuperkmer command extracts super k-mers from DNA sequences.
// A super k-mer is a maximal subsequence where all consecutive k-mers
// share the same minimizer. This decomposition is useful for efficient
// k-mer indexing and analysis in bioinformatics applications.
package obisuperkmer

View File

@@ -0,0 +1,69 @@
package obisuperkmer
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Private variables for storing option values
var _KmerSize = 31
var _MinimizerSize = 13
// SuperKmerOptionSet defines every option related to super k-mer extraction.
//
// The function adds to a CLI every option proposed to the user
// to tune the parameters of the super k-mer extraction algorithm.
//
// Parameters:
// - options: is a pointer to a getoptions.GetOpt instance normally
// produced by the obioptions.GenerateOptionParser function.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
options.Alias("k"),
options.Description("Size of k-mers (must be between m+1 and 31)."))
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
options.Alias("m"),
options.Description("Size of minimizers (must be between 1 and k-1)."))
}
// OptionSet adds to the basic option set every option declared for
// the obisuperkmer command.
//
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(false)(options)
SuperKmerOptionSet(options)
}
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
//
// It does not take any parameters.
// It returns an integer representing the k-mer size.
func CLIKmerSize() int {
return _KmerSize
}
// SetKmerSize sets the k-mer size for super k-mer extraction.
//
// Parameters:
// - k: the k-mer size (must be between m+1 and 31).
func SetKmerSize(k int) {
_KmerSize = k
}
// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
//
// It does not take any parameters.
// It returns an integer representing the minimizer size.
func CLIMinimizerSize() int {
return _MinimizerSize
}
// SetMinimizerSize sets the minimizer size for super k-mer extraction.
//
// Parameters:
// - m: the minimizer size (must be between 1 and k-1).
func SetMinimizerSize(m int) {
_MinimizerSize = m
}

View File

@@ -0,0 +1,59 @@
package obisuperkmer
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
)
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
//
// This function takes an iterator of BioSequence objects, extracts super k-mers
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
//
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
// the same minimizer. The resulting BioSequences contain metadata including:
// - minimizer_value: the canonical minimizer value
// - minimizer_seq: the DNA sequence of the minimizer
// - k: the k-mer size used
// - m: the minimizer size used
// - start: starting position in the original sequence
// - end: ending position in the original sequence
// - parent_id: ID of the parent sequence
//
// Parameters:
// - iterator: an iterator yielding BioSequence objects to process.
//
// Returns:
// - An iterator yielding BioSequence objects representing super k-mers.
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
// Get k-mer and minimizer sizes from CLI options
k := CLIKmerSize()
m := CLIMinimizerSize()
// Validate parameters
if m < 1 || m >= k {
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
}
if k < 2 || k > 31 {
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
}
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
// Create the worker for super k-mer extraction
worker := obikmer.SuperKmerWorker(k, m)
// Apply the worker to the iterator with parallel processing
newIter := iterator.MakeIWorker(
worker,
false, // don't merge results
obidefault.ParallelWorkers(),
)
return newIter
}

View File

@@ -0,0 +1,149 @@
package obisuperkmer
import (
"testing"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
func TestCLIExtractSuperKmers(t *testing.T) {
// Create a test sequence
testSeq := obiseq.NewBioSequence(
"test_seq",
[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
"",
)
// Create a batch with the test sequence
batch := obiseq.NewBioSequenceBatch()
batch.Add(testSeq)
// Create an iterator from the batch
iterator := obiiter.MakeBioSequenceBatchChannel(1)
go func() {
iterator.Push(batch)
iterator.Close()
}()
// Set test parameters
SetKmerSize(15)
SetMinimizerSize(7)
// Extract super k-mers
result := CLIExtractSuperKmers(iterator)
// Count the number of super k-mers
count := 0
for result.Next() {
batch := result.Get()
for _, sk := range batch.Slice() {
count++
// Verify that the super k-mer has the expected attributes
if !sk.HasAttribute("minimizer_value") {
t.Error("Super k-mer missing 'minimizer_value' attribute")
}
if !sk.HasAttribute("minimizer_seq") {
t.Error("Super k-mer missing 'minimizer_seq' attribute")
}
if !sk.HasAttribute("k") {
t.Error("Super k-mer missing 'k' attribute")
}
if !sk.HasAttribute("m") {
t.Error("Super k-mer missing 'm' attribute")
}
if !sk.HasAttribute("start") {
t.Error("Super k-mer missing 'start' attribute")
}
if !sk.HasAttribute("end") {
t.Error("Super k-mer missing 'end' attribute")
}
if !sk.HasAttribute("parent_id") {
t.Error("Super k-mer missing 'parent_id' attribute")
}
// Verify attribute values
k, _ := sk.GetIntAttribute("k")
m, _ := sk.GetIntAttribute("m")
if k != 15 {
t.Errorf("Expected k=15, got k=%d", k)
}
if m != 7 {
t.Errorf("Expected m=7, got m=%d", m)
}
parentID, _ := sk.GetStringAttribute("parent_id")
if parentID != "test_seq" {
t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
}
}
}
if count == 0 {
t.Error("No super k-mers were extracted")
}
t.Logf("Extracted %d super k-mers from test sequence", count)
}
func TestOptionGettersAndSetters(t *testing.T) {
// Test initial values
if CLIKmerSize() != 21 {
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
}
if CLIMinimizerSize() != 11 {
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
}
// Test setters
SetKmerSize(25)
SetMinimizerSize(13)
if CLIKmerSize() != 25 {
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
}
if CLIMinimizerSize() != 13 {
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
}
// Reset to defaults
SetKmerSize(21)
SetMinimizerSize(11)
}
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
// Create a longer test sequence
longSeq := make([]byte, 1000)
bases := []byte{'A', 'C', 'G', 'T'}
for i := range longSeq {
longSeq[i] = bases[i%4]
}
testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")
// Set parameters
SetKmerSize(21)
SetMinimizerSize(11)
b.ResetTimer()
for i := 0; i < b.N; i++ {
batch := obiseq.NewBioSequenceBatch()
batch.Add(testSeq)
iterator := obiiter.MakeBioSequenceBatchChannel(1)
go func() {
iterator.Push(batch)
iterator.Close()
}()
result := CLIExtractSuperKmers(iterator)
// Consume the iterator
for result.Next() {
result.Get()
}
}
}