mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-26 05:50:52 +00:00
Refactor k-mer matching pipeline with improved concurrency and memory management
Refactor k-mer matching to use a pipeline architecture with improved concurrency and memory management: - Replace sort.Slice with slices.SortFunc and cmp.Compare for better performance - Introduce PreparedQueries struct to encapsulate query buckets with metadata - Implement MergeQueries function to merge query buckets from multiple batches - Rewrite MatchBatch to use pre-allocated results and mutexes instead of map-based accumulation - Add seek optimization in matchPartition to reduce linear scanning - Refactor match command to use a multi-stage pipeline with proper batching and merging - Add index directory option for match command - Improve parallel processing of sequence batches This refactoring improves performance by reducing memory allocations, optimizing k-mer lookup, and implementing a more efficient pipeline for large-scale k-mer matching operations.
This commit is contained in:
@@ -5,9 +5,13 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
@@ -75,22 +79,36 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
||||
}
|
||||
}
|
||||
|
||||
// Read and process sequences
|
||||
// Read and process sequences in parallel
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open sequence files: %w", err)
|
||||
}
|
||||
|
||||
seqCount := 0
|
||||
for sequences.Next() {
|
||||
batch := sequences.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
builder.AddSequence(0, seq)
|
||||
seqCount++
|
||||
nworkers := obidefault.ParallelWorkers()
|
||||
var seqCount atomic.Int64
|
||||
var wg sync.WaitGroup
|
||||
|
||||
consumer := func(iter obiiter.IBioSequence) {
|
||||
defer wg.Done()
|
||||
for iter.Next() {
|
||||
batch := iter.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
builder.AddSequence(0, seq)
|
||||
seqCount.Add(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount)
|
||||
for i := 1; i < nworkers; i++ {
|
||||
wg.Add(1)
|
||||
go consumer(sequences.Split())
|
||||
}
|
||||
wg.Add(1)
|
||||
go consumer(sequences)
|
||||
wg.Wait()
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount.Load())
|
||||
|
||||
// Finalize
|
||||
ksg, err := builder.Close()
|
||||
|
||||
Reference in New Issue
Block a user