mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add obisuperkmer command implementation and tests
This commit adds the implementation of the obisuperkmer command, including: - The main command in cmd/obitools/obisuperkmer/ - The package implementation in pkg/obitools/obisuperkmer/ - Automated tests in obitests/obitools/obisuperkmer/ - Documentation for the implementation and tests The obisuperkmer command extracts super k-mers from DNA sequences, following the standard OBITools architecture. It includes proper CLI option handling, validation of parameters, and integration with the OBITools pipeline system. Tests cover basic functionality, parameter validation, output format, metadata preservation, and file I/O operations.
This commit is contained in:
10
pkg/obitools/obisuperkmer/obisuperkmer.go
Normal file
10
pkg/obitools/obisuperkmer/obisuperkmer.go
Normal file
@@ -0,0 +1,10 @@
|
||||
// Package obisuperkmer provides the functions used by the obisuperkmer utility.
|
||||
//
|
||||
// The obitools/obisuperkmer package contains every
|
||||
// function specifically required by the obisuperkmer utility.
|
||||
//
|
||||
// The obisuperkmer command extracts super k-mers from DNA sequences.
|
||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. This decomposition is useful for efficient
|
||||
// k-mer indexing and analysis in bioinformatics applications.
|
||||
package obisuperkmer
|
||||
69
pkg/obitools/obisuperkmer/options.go
Normal file
69
pkg/obitools/obisuperkmer/options.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Backing storage for the command line options of obisuperkmer.
// The initial values are the defaults applied when an option is not set.
var (
	_KmerSize      = 31 // k-mer size, bound to the "kmer-size" option
	_MinimizerSize = 13 // minimizer size, bound to the "minimizer-size" option
)
|
||||
|
||||
// SuperKmerOptionSet defines every option related to super k-mer extraction.
|
||||
//
|
||||
// The function adds to a CLI every option proposed to the user
|
||||
// to tune the parameters of the super k-mer extraction algorithm.
|
||||
//
|
||||
// Parameters:
|
||||
// - options: is a pointer to a getoptions.GetOpt instance normally
|
||||
// produced by the obioptions.GenerateOptionParser function.
|
||||
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_KmerSize, "kmer-size", _KmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between m+1 and 31)."))
|
||||
|
||||
options.IntVar(&_MinimizerSize, "minimizer-size", _MinimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers (must be between 1 and k-1)."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
|
||||
// the obisuperkmer command.
|
||||
//
|
||||
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(false)(options)
|
||||
SuperKmerOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size to use for super k-mer extraction.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns an integer representing the k-mer size.
|
||||
func CLIKmerSize() int {
|
||||
return _KmerSize
|
||||
}
|
||||
|
||||
// SetKmerSize sets the k-mer size for super k-mer extraction.
|
||||
//
|
||||
// Parameters:
|
||||
// - k: the k-mer size (must be between m+1 and 31).
|
||||
func SetKmerSize(k int) {
|
||||
_KmerSize = k
|
||||
}
|
||||
|
||||
// CLIMinimizerSize returns the minimizer size to use for super k-mer extraction.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns an integer representing the minimizer size.
|
||||
func CLIMinimizerSize() int {
|
||||
return _MinimizerSize
|
||||
}
|
||||
|
||||
// SetMinimizerSize sets the minimizer size for super k-mer extraction.
|
||||
//
|
||||
// Parameters:
|
||||
// - m: the minimizer size (must be between 1 and k-1).
|
||||
func SetMinimizerSize(m int) {
|
||||
_MinimizerSize = m
|
||||
}
|
||||
59
pkg/obitools/obisuperkmer/superkmer.go
Normal file
59
pkg/obitools/obisuperkmer/superkmer.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
)
|
||||
|
||||
// CLIExtractSuperKmers extracts super k-mers from an iterator of BioSequences.
|
||||
//
|
||||
// This function takes an iterator of BioSequence objects, extracts super k-mers
|
||||
// from each sequence using the k-mer and minimizer sizes specified by CLI options,
|
||||
// and returns a new iterator yielding the extracted super k-mers as BioSequence objects.
|
||||
//
|
||||
// Each super k-mer is a maximal subsequence where all consecutive k-mers share
|
||||
// the same minimizer. The resulting BioSequences contain metadata including:
|
||||
// - minimizer_value: the canonical minimizer value
|
||||
// - minimizer_seq: the DNA sequence of the minimizer
|
||||
// - k: the k-mer size used
|
||||
// - m: the minimizer size used
|
||||
// - start: starting position in the original sequence
|
||||
// - end: ending position in the original sequence
|
||||
// - parent_id: ID of the parent sequence
|
||||
//
|
||||
// Parameters:
|
||||
// - iterator: an iterator yielding BioSequence objects to process.
|
||||
//
|
||||
// Returns:
|
||||
// - An iterator yielding BioSequence objects representing super k-mers.
|
||||
func CLIExtractSuperKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
// Get k-mer and minimizer sizes from CLI options
|
||||
k := CLIKmerSize()
|
||||
m := CLIMinimizerSize()
|
||||
|
||||
// Validate parameters
|
||||
if m < 1 || m >= k {
|
||||
log.Fatalf("Invalid parameters: minimizer size (%d) must be between 1 and k-1 (%d)", m, k-1)
|
||||
}
|
||||
|
||||
if k < 2 || k > 31 {
|
||||
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
log.Printf("Extracting super k-mers with k=%d, m=%d", k, m)
|
||||
|
||||
// Create the worker for super k-mer extraction
|
||||
worker := obikmer.SuperKmerWorker(k, m)
|
||||
|
||||
// Apply the worker to the iterator with parallel processing
|
||||
newIter := iterator.MakeIWorker(
|
||||
worker,
|
||||
false, // don't merge results
|
||||
obidefault.ParallelWorkers(),
|
||||
)
|
||||
|
||||
return newIter
|
||||
}
|
||||
149
pkg/obitools/obisuperkmer/superkmer_test.go
Normal file
149
pkg/obitools/obisuperkmer/superkmer_test.go
Normal file
@@ -0,0 +1,149 @@
|
||||
package obisuperkmer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
func TestCLIExtractSuperKmers(t *testing.T) {
|
||||
// Create a test sequence
|
||||
testSeq := obiseq.NewBioSequence(
|
||||
"test_seq",
|
||||
[]byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
||||
"",
|
||||
)
|
||||
|
||||
// Create a batch with the test sequence
|
||||
batch := obiseq.NewBioSequenceBatch()
|
||||
batch.Add(testSeq)
|
||||
|
||||
// Create an iterator from the batch
|
||||
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
||||
go func() {
|
||||
iterator.Push(batch)
|
||||
iterator.Close()
|
||||
}()
|
||||
|
||||
// Set test parameters
|
||||
SetKmerSize(15)
|
||||
SetMinimizerSize(7)
|
||||
|
||||
// Extract super k-mers
|
||||
result := CLIExtractSuperKmers(iterator)
|
||||
|
||||
// Count the number of super k-mers
|
||||
count := 0
|
||||
for result.Next() {
|
||||
batch := result.Get()
|
||||
for _, sk := range batch.Slice() {
|
||||
count++
|
||||
|
||||
// Verify that the super k-mer has the expected attributes
|
||||
if !sk.HasAttribute("minimizer_value") {
|
||||
t.Error("Super k-mer missing 'minimizer_value' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("minimizer_seq") {
|
||||
t.Error("Super k-mer missing 'minimizer_seq' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("k") {
|
||||
t.Error("Super k-mer missing 'k' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("m") {
|
||||
t.Error("Super k-mer missing 'm' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("start") {
|
||||
t.Error("Super k-mer missing 'start' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("end") {
|
||||
t.Error("Super k-mer missing 'end' attribute")
|
||||
}
|
||||
if !sk.HasAttribute("parent_id") {
|
||||
t.Error("Super k-mer missing 'parent_id' attribute")
|
||||
}
|
||||
|
||||
// Verify attribute values
|
||||
k, _ := sk.GetIntAttribute("k")
|
||||
m, _ := sk.GetIntAttribute("m")
|
||||
|
||||
if k != 15 {
|
||||
t.Errorf("Expected k=15, got k=%d", k)
|
||||
}
|
||||
if m != 7 {
|
||||
t.Errorf("Expected m=7, got m=%d", m)
|
||||
}
|
||||
|
||||
parentID, _ := sk.GetStringAttribute("parent_id")
|
||||
if parentID != "test_seq" {
|
||||
t.Errorf("Expected parent_id='test_seq', got '%s'", parentID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if count == 0 {
|
||||
t.Error("No super k-mers were extracted")
|
||||
}
|
||||
|
||||
t.Logf("Extracted %d super k-mers from test sequence", count)
|
||||
}
|
||||
|
||||
func TestOptionGettersAndSetters(t *testing.T) {
|
||||
// Test initial values
|
||||
if CLIKmerSize() != 21 {
|
||||
t.Errorf("Expected default k-mer size 21, got %d", CLIKmerSize())
|
||||
}
|
||||
if CLIMinimizerSize() != 11 {
|
||||
t.Errorf("Expected default minimizer size 11, got %d", CLIMinimizerSize())
|
||||
}
|
||||
|
||||
// Test setters
|
||||
SetKmerSize(25)
|
||||
SetMinimizerSize(13)
|
||||
|
||||
if CLIKmerSize() != 25 {
|
||||
t.Errorf("SetKmerSize failed: expected 25, got %d", CLIKmerSize())
|
||||
}
|
||||
if CLIMinimizerSize() != 13 {
|
||||
t.Errorf("SetMinimizerSize failed: expected 13, got %d", CLIMinimizerSize())
|
||||
}
|
||||
|
||||
// Reset to defaults
|
||||
SetKmerSize(21)
|
||||
SetMinimizerSize(11)
|
||||
}
|
||||
|
||||
func BenchmarkCLIExtractSuperKmers(b *testing.B) {
|
||||
// Create a longer test sequence
|
||||
longSeq := make([]byte, 1000)
|
||||
bases := []byte{'A', 'C', 'G', 'T'}
|
||||
for i := range longSeq {
|
||||
longSeq[i] = bases[i%4]
|
||||
}
|
||||
|
||||
testSeq := obiseq.NewBioSequence("bench_seq", longSeq, "")
|
||||
|
||||
// Set parameters
|
||||
SetKmerSize(21)
|
||||
SetMinimizerSize(11)
|
||||
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
batch := obiseq.NewBioSequenceBatch()
|
||||
batch.Add(testSeq)
|
||||
|
||||
iterator := obiiter.MakeBioSequenceBatchChannel(1)
|
||||
go func() {
|
||||
iterator.Push(batch)
|
||||
iterator.Close()
|
||||
}()
|
||||
|
||||
result := CLIExtractSuperKmers(iterator)
|
||||
|
||||
// Consume the iterator
|
||||
for result.Next() {
|
||||
result.Get()
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user