Refactor k-mer matching pipeline with improved concurrency and memory management

Refactor k-mer matching to use a pipeline architecture with improved concurrency and memory management: - Replace sort.Slice with slices.SortFunc and cmp.Compare for better performance - Introduce PreparedQueries struct to encapsulate query buckets with metadata - Implement MergeQueries function to merge query buckets from multiple batches - Rewrite MatchBatch to use pre-allocated results and mutexes instead of map-based accumulation - Add seek optimization in matchPartition to reduce linear scanning - Refactor match command to use a multi-stage pipeline with proper batching and merging - Add index directory option for match command - Improve parallel processing of sequence batches This refactoring improves performance by reducing memory allocations, optimizing k-mer lookup, and implementing a more efficient pipeline for large-scale k-mer matching operations.
2026-06-24 17:51:00 +00:00 · 2026-02-10 22:10:22 +01:00
parent bebbbbfe7d
commit ac41dd8a22
5 changed files with 311 additions and 108 deletions
@@ -1,7 +1,8 @@
 package obikmer
 import (
-	"sort"
+	"cmp"
 	"slices"
 	"sync"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
@@ -12,23 +13,73 @@ import (
 type QueryEntry struct {
 	Kmer   uint64 // canonical k-mer value
 	SeqIdx int    // index within the batch
-	Pos    int    // 0-based position in the sequence
+	Pos    int    // 1-based position in the sequence
 }
-// MatchResult maps sequence index → sorted slice of matched positions.
+// MatchResult holds matched positions for each sequence in a batch.
-type MatchResult map[int][]int
+// results[i] contains the sorted matched positions for sequence i.
 type MatchResult [][]int
-// seqMatchResult collects matched positions for a single sequence.
+// PreparedQueries holds pre-computed query buckets along with the number
-type seqMatchResult struct {
+// of sequences they were built from. This is used by the accumulation
-	mu        sync.Mutex
+// pipeline to merge queries from multiple batches.
-	positions []int
+type PreparedQueries struct {
 	Buckets [][]QueryEntry // queries[partition], each sorted by Kmer
 	NSeqs   int            // number of sequences that produced these queries
 	NKmers  int            // total number of k-mer entries across all partitions
 }
 // MergeQueries merges src into dst, offsetting all SeqIdx values in src
 // by dst.NSeqs. Both dst and src must have the same number of partitions.
 // After merging, src should not be reused.
 //
 // Each partition's entries are merged in sorted order (merge-sort of two
 // already-sorted slices).
 func MergeQueries(dst, src *PreparedQueries) {
 	for p := range dst.Buckets {
 		if len(src.Buckets[p]) == 0 {
 			continue
 		}
 		offset := dst.NSeqs
 		srcB := src.Buckets[p]
 		// Offset SeqIdx in src entries
 		for i := range srcB {
 			srcB[i].SeqIdx += offset
 		}
 		if len(dst.Buckets[p]) == 0 {
 			dst.Buckets[p] = srcB
 			continue
 		}
 		// Merge two sorted slices
 		dstB := dst.Buckets[p]
 		merged := make([]QueryEntry, 0, len(dstB)+len(srcB))
 		i, j := 0, 0
 		for i < len(dstB) && j < len(srcB) {
 			if dstB[i].Kmer <= srcB[j].Kmer {
 				merged = append(merged, dstB[i])
 				i++
 			} else {
 				merged = append(merged, srcB[j])
 				j++
 			}
 		}
 		merged = append(merged, dstB[i:]...)
 		merged = append(merged, srcB[j:]...)
 		dst.Buckets[p] = merged
 	}
 	dst.NSeqs += src.NSeqs
 	dst.NKmers += src.NKmers
 }
 // PrepareQueries extracts all canonical k-mers from a batch of sequences
 // and groups them by partition using super-kmer minimizers.
 //
-// Returns queries[partition] where each slice is sorted by Kmer value.
+// Returns a PreparedQueries with sorted per-partition buckets.
-func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) [][]QueryEntry {
+func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) *PreparedQueries {
 	P := ksg.partitions
 	k := ksg.k
 	m := ksg.m
@@ -39,6 +90,7 @@ func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) [][]Que
 		buckets[i] = make([]QueryEntry, 0, 64)
 	}
 	totalKmers := 0
 	for seqIdx, seq := range sequences {
 		bseq := seq.Sequence()
 		if len(bseq) < k {
@@ -60,71 +112,67 @@ func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) [][]Que
 				buckets[partition] = append(buckets[partition], QueryEntry{
 					Kmer:   kmer,
 					SeqIdx: seqIdx,
-					Pos:    sk.Start + localPos,
+					Pos:    sk.Start + localPos + 1,
 				})
 				localPos++
 				totalKmers++
 			}
 		}
 	}
 	// Sort each bucket by k-mer value for merge-scan
 	for p := range buckets {
-		sort.Slice(buckets[p], func(i, j int) bool {
+		slices.SortFunc(buckets[p], func(a, b QueryEntry) int {
-			return buckets[p][i].Kmer < buckets[p][j].Kmer
+			return cmp.Compare(a.Kmer, b.Kmer)
 		})
 	}
-	return buckets
+	return &PreparedQueries{
 		Buckets: buckets,
 		NSeqs:   len(sequences),
 		NKmers:  totalKmers,
 	}
 }
 // MatchBatch looks up pre-sorted queries against one set of the index.
 // Partitions are processed in parallel. For each partition, a merge-scan
 // compares the sorted queries against the sorted KDI stream.
 //
-// Returns a MatchResult mapping sequence index to sorted matched positions.
+// Returns a MatchResult where result[i] contains sorted matched positions
-func (ksg *KmerSetGroup) MatchBatch(setIndex int, queries [][]QueryEntry) MatchResult {
+// for sequence i.
 func (ksg *KmerSetGroup) MatchBatch(setIndex int, pq *PreparedQueries) MatchResult {
 	P := ksg.partitions
-	// Per-sequence result collectors
+	// Pre-allocated per-sequence results and mutexes.
-	var resultMu sync.Mutex
+	// Each partition goroutine appends to results[seqIdx] with mus[seqIdx] held.
-	resultMap := make(map[int]*seqMatchResult)
+	// Contention is low: a sequence's k-mers span many partitions, but each
-
+	// partition processes its queries sequentially and the critical section is tiny.
-	getResult := func(seqIdx int) *seqMatchResult {
+	results := make([][]int, pq.NSeqs)
-		resultMu.Lock()
+	mus := make([]sync.Mutex, pq.NSeqs)
 		sr, ok := resultMap[seqIdx]
 		if !ok {
 			sr = &seqMatchResult{}
 			resultMap[seqIdx] = sr
 		}
 		resultMu.Unlock()
 		return sr
 	}
 	var wg sync.WaitGroup
 	for p := 0; p < P; p++ {
-		if len(queries[p]) == 0 {
+		if len(pq.Buckets[p]) == 0 {
 			continue
 		}
 		wg.Add(1)
 		go func(part int) {
 			defer wg.Done()
-			ksg.matchPartition(setIndex, part, queries[part], getResult)
+			ksg.matchPartition(setIndex, part, pq.Buckets[part], results, mus)
 		}(p)
 	}
 	wg.Wait()
-	// Build final result with sorted positions
+	// Sort positions within each sequence
-	result := make(MatchResult, len(resultMap))
+	for i := range results {
-	for seqIdx, sr := range resultMap {
+		if len(results[i]) > 1 {
-		if len(sr.positions) > 0 {
+			slices.Sort(results[i])
 			sort.Ints(sr.positions)
 			result[seqIdx] = sr.positions
 		}
 	}
-	return result
+	return MatchResult(results)
 }
 // matchPartition processes one partition: opens the KDI reader (with index),
@@ -133,7 +181,8 @@ func (ksg *KmerSetGroup) matchPartition(
 	setIndex int,
 	partIndex int,
 	queries []QueryEntry, // sorted by Kmer
-	getResult func(int) *seqMatchResult,
+	results [][]int,
 	mus []sync.Mutex,
 ) {
 	r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, partIndex))
 	if err != nil {
@@ -161,6 +210,23 @@ func (ksg *KmerSetGroup) matchPartition(
 	for qi < len(queries) {
 		q := queries[qi]
 		// If the next query is far ahead, re-seek instead of linear scan.
 		// Only seek if we'd skip more k-mers than the index stride,
 		// otherwise linear scan through the buffer is faster than a syscall.
 		if r.index != nil && q.Kmer > currentKmer && r.Remaining() > uint64(r.index.stride) {
 			_, skipCount, found := r.index.FindOffset(q.Kmer)
 			if found && skipCount > r.read+uint64(r.index.stride) {
 				if err := r.SeekTo(q.Kmer); err == nil {
 					nextKmer, nextOk := r.Next()
 					if !nextOk {
 						return
 					}
 					currentKmer = nextKmer
 					ok = true
 				}
 			}
 		}
 		// Advance KDI stream until >= query kmer
 		for currentKmer < q.Kmer {
 			currentKmer, ok = r.Next()
@@ -173,10 +239,10 @@ func (ksg *KmerSetGroup) matchPartition(
 			// Match! Record all queries with this same k-mer value
 			matchedKmer := q.Kmer
 			for qi < len(queries) && queries[qi].Kmer == matchedKmer {
-				sr := getResult(queries[qi].SeqIdx)
+				idx := queries[qi].SeqIdx
-				sr.mu.Lock()
+				mus[idx].Lock()
-				sr.positions = append(sr.positions, queries[qi].Pos)
+				results[idx] = append(results[idx], queries[qi].Pos)
-				sr.mu.Unlock()
+				mus[idx].Unlock()
 				qi++
 			}
 		} else {
@@ -5,9 +5,13 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
 	"sync"
 	"sync/atomic"
 	log "github.com/sirupsen/logrus"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
 	"github.com/DavidGamba/go-getoptions"
@@ -75,22 +79,36 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
 		}
 	}
-	// Read and process sequences
+	// Read and process sequences in parallel
 	sequences, err := obiconvert.CLIReadBioSequences(args...)
 	if err != nil {
 		return fmt.Errorf("failed to open sequence files: %w", err)
 	}
-	seqCount := 0
+	nworkers := obidefault.ParallelWorkers()
-	for sequences.Next() {
+	var seqCount atomic.Int64
-		batch := sequences.Get()
+	var wg sync.WaitGroup
 	consumer := func(iter obiiter.IBioSequence) {
 		defer wg.Done()
 		for iter.Next() {
 			batch := iter.Get()
 			for _, seq := range batch.Slice() {
 				builder.AddSequence(0, seq)
-			seqCount++
+				seqCount.Add(1)
 			}
 		}
 	}
-	log.Infof("Processed %d sequences", seqCount)
+	for i := 1; i < nworkers; i++ {
 		wg.Add(1)
 		go consumer(sequences.Split())
 	}
 	wg.Add(1)
 	go consumer(sequences)
 	wg.Wait()
 	log.Infof("Processed %d sequences", seqCount.Load())
 	// Finalize
 	ksg, err := builder.Close()
@@ -3,10 +3,12 @@ package obik
 import (
 	"context"
 	"fmt"
 	"sync"
 	log "github.com/sirupsen/logrus"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
@@ -14,56 +16,47 @@ import (
 	"github.com/DavidGamba/go-getoptions"
 )
-// matchSliceWorker creates a SeqSliceWorker that annotates each sequence
+// defaultMatchQueryThreshold is the minimum number of k-mer entries to
-// in a batch with k-mer match positions from the index.
+// accumulate before launching a MatchBatch. Larger values amortize the
-// For each set, an attribute "kmer_matched_<setID>" is added containing
+// cost of opening .kdi files across more query k-mers.
-// a sorted []int of 0-based positions where matched k-mers start.
+const defaultMatchQueryThreshold = 10_000_000
 func matchSliceWorker(ksg *obikmer.KmerSetGroup, setIndices []int) obiseq.SeqSliceWorker {
 	return func(batch obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
 		if len(batch) == 0 {
 			return batch, nil
 		}
-		// Build slice of *BioSequence for PrepareQueries
+// preparedBatch pairs a batch with its pre-computed queries.
-		seqs := make([]*obiseq.BioSequence, len(batch))
+type preparedBatch struct {
-		for i := range batch {
+	batch   obiiter.BioSequenceBatch
-			seqs[i] = batch[i]
+	seqs    []*obiseq.BioSequence
-		}
+	queries *obikmer.PreparedQueries
 }
-		// Prepare queries once (shared across sets)
+// accumulatedWork holds multiple prepared batches whose queries have been
-		queries := ksg.PrepareQueries(seqs)
+// merged into a single PreparedQueries. The flat seqs slice allows
-
+// MatchBatch results (indexed by merged SeqIdx) to be mapped back to
-		// Match against each selected set
+// the original sequences.
-		for _, setIdx := range setIndices {
+type accumulatedWork struct {
-			result := ksg.MatchBatch(setIdx, queries)
+	batches []obiiter.BioSequenceBatch // original batches in order
-
+	seqs    []*obiseq.BioSequence      // flat: seqs from all batches concatenated
-			setID := ksg.SetIDOf(setIdx)
+	queries *obikmer.PreparedQueries   // merged queries with rebased SeqIdx
 			if setID == "" {
 				setID = fmt.Sprintf("set_%d", setIdx)
 			}
 			attrName := "kmer_matched_" + setID
 			for seqIdx, positions := range result {
 				if len(positions) > 0 {
 					batch[seqIdx].SetAttribute(attrName, positions)
 				}
 			}
 		}
 		return batch, nil
 	}
 }
 // runMatch implements the "obik match" subcommand.
-// It reads sequences, looks up their k-mers in a disk-based index,
+//
-// and annotates each sequence with match positions.
+// Pipeline architecture (no shared mutable state between stages):
 //
 //	[input batches]
 //	     │  Split across nCPU goroutines
 //	     ▼
 //	PrepareQueries (CPU, parallel)
 //	     │  preparedCh
 //	     ▼
 //	Accumulate & MergeQueries (1 goroutine)
 //	     │  matchCh — fires when totalKmers >= threshold
 //	     ▼
 //	MatchBatch + annotate (1 goroutine, internal parallelism per partition)
 //	     │
 //	     ▼
 //	[output batches]
 func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
-	if len(args) < 1 {
+	indexDir := CLIIndexDirectory()
 		return fmt.Errorf("usage: obik match [options] <index_directory> [sequence_files...]")
 	}
 	indexDir := args[0]
 	seqArgs := args[1:]
 	// Open the k-mer index
 	ksg, err := obikmer.OpenKmerSetGroup(indexDir)
@@ -86,14 +79,12 @@ func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error
 			return fmt.Errorf("no sets match the given patterns")
 		}
 	} else {
 		// All sets
 		setIndices = make([]int, ksg.Size())
 		for i := range setIndices {
 			setIndices[i] = i
 		}
 	}
 	// Log which sets we'll match
 	for _, idx := range setIndices {
 		id := ksg.SetIDOf(idx)
 		if id == "" {
@@ -102,21 +93,128 @@ func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error
 		log.Infof("Matching against set %d (%s): %d k-mers", idx, id, ksg.Len(idx))
 	}
-	// Read sequences
+	// Read input sequences
-	sequences, err := obiconvert.CLIReadBioSequences(seqArgs...)
+	sequences, err := obiconvert.CLIReadBioSequences(args...)
 	if err != nil {
 		return fmt.Errorf("failed to open sequence files: %w", err)
 	}
-	// Apply the batch worker
+	nworkers := obidefault.ParallelWorkers()
 	worker := matchSliceWorker(ksg, setIndices)
 	matched := sequences.MakeISliceWorker(
 		worker,
 		false,
 		obidefault.ParallelWorkers(),
 	)
-	obiconvert.CLIWriteBioSequences(matched, true)
+	// --- Stage 1: Prepare queries in parallel ---
 	preparedCh := make(chan preparedBatch, nworkers)
 	var prepWg sync.WaitGroup
 	preparer := func(iter obiiter.IBioSequence) {
 		defer prepWg.Done()
 		for iter.Next() {
 			batch := iter.Get()
 			slice := batch.Slice()
 			seqs := make([]*obiseq.BioSequence, len(slice))
 			for i, s := range slice {
 				seqs[i] = s
 			}
 			pq := ksg.PrepareQueries(seqs)
 			preparedCh <- preparedBatch{
 				batch:   batch,
 				seqs:    seqs,
 				queries: pq,
 			}
 		}
 	}
 	for i := 1; i < nworkers; i++ {
 		prepWg.Add(1)
 		go preparer(sequences.Split())
 	}
 	prepWg.Add(1)
 	go preparer(sequences)
 	go func() {
 		prepWg.Wait()
 		close(preparedCh)
 	}()
 	// --- Stage 2: Accumulate & merge queries ---
 	matchCh := make(chan *accumulatedWork, 2)
 	go func() {
 		defer close(matchCh)
 		var acc *accumulatedWork
 		for pb := range preparedCh {
 			if acc == nil {
 				acc = &accumulatedWork{
 					batches: []obiiter.BioSequenceBatch{pb.batch},
 					seqs:    pb.seqs,
 					queries: pb.queries,
 				}
 			} else {
 				// Merge this batch's queries into the accumulator
 				obikmer.MergeQueries(acc.queries, pb.queries)
 				acc.batches = append(acc.batches, pb.batch)
 				acc.seqs = append(acc.seqs, pb.seqs...)
 			}
 			// Flush when we exceed the threshold
 			if acc.queries.NKmers >= defaultMatchQueryThreshold {
 				matchCh <- acc
 				acc = nil
 			}
 		}
 		// Flush remaining
 		if acc != nil {
 			matchCh <- acc
 		}
 	}()
 	// --- Stage 3: Match & annotate ---
 	output := obiiter.MakeIBioSequence()
 	if sequences.IsPaired() {
 		output.MarkAsPaired()
 	}
 	output.Add(1)
 	go func() {
 		defer output.Done()
 		for work := range matchCh {
 			// Match against each selected set
 			for _, setIdx := range setIndices {
 				result := ksg.MatchBatch(setIdx, work.queries)
 				setID := ksg.SetIDOf(setIdx)
 				if setID == "" {
 					setID = fmt.Sprintf("set_%d", setIdx)
 				}
 				attrName := "kmer_matched_" + setID
 				for seqIdx, positions := range result {
 					if len(positions) > 0 {
 						work.seqs[seqIdx].SetAttribute(attrName, positions)
 					}
 				}
 			}
 			// Push annotated batches to output
 			for _, b := range work.batches {
 				output.Push(b)
 			}
 			// Help GC
 			work.seqs = nil
 			work.queries = nil
 		}
 	}()
 	go output.WaitAndClose()
 	obiconvert.CLIWriteBioSequences(output, true)
 	obiutils.WaitForLastPipe()
 	return nil
@@ -70,6 +70,7 @@ func OptionSet(opt *getoptions.GetOpt) {
 	// match: annotate sequences with k-mer match positions from an index
 	matchCmd := opt.NewCommand("match", "Annotate sequences with k-mer match positions from an index")
 	IndexDirectoryOptionSet(matchCmd)
 	obiconvert.InputOptionSet(matchCmd)
 	obiconvert.OutputOptionSet(matchCmd)
 	SetSelectionOptionSet(matchCmd)
@@ -280,6 +280,26 @@ func CLIKeepShorter() bool {
 	return _keepShorter
 }
 // ==============================
 // Match-specific options
 // ==============================
 var _indexDirectory = ""
 // IndexDirectoryOptionSet registers --index / -i (mandatory directory for match).
 func IndexDirectoryOptionSet(options *getoptions.GetOpt) {
 	options.StringVar(&_indexDirectory, "index", _indexDirectory,
 		options.Alias("i"),
 		options.Required(),
 		options.ArgName("DIRECTORY"),
 		options.Description("Path to the kmer index directory."))
 }
 // CLIIndexDirectory returns the --index directory path.
 func CLIIndexDirectory() string {
 	return _indexDirectory
 }
 // CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled).
 func CLIIndexEntropyThreshold() float64 {
 	return _indexEntropyThreshold