From a186bd1c9219d0e9aa3be1bda5f1b60b1f06037d Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Tue, 5 May 2026 18:05:26 +0200
Subject: [PATCH 1/5] fix: validate non-empty sequence IDs in FASTA and FASTQ
 writers

Adds a pre-processing guard that checks for empty sequence identifiers before formatting. This prevents malformed FASTA output and stops downstream processing of invalid FASTQ data by aborting the program (`log.Fatalf`) as soon as a record without an identifier is found. The check is placed before the existing sequence-length test, so it applies to every record of a batch, including zero-length sequences.
---
 pkg/obiformats/fastseq_write_fasta.go | 3 +++
 pkg/obiformats/fastseq_write_fastq.go | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go
index 88b029f..a79584e 100644
--- a/pkg/obiformats/fastseq_write_fasta.go
+++ b/pkg/obiformats/fastseq_write_fasta.go
@@ -90,6 +90,9 @@ func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, ski
 	log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
 
 	for _, seq := range batch.Slice() {
+		if len(seq.Id()) == 0 {
+			log.Fatalf("Sequence identifier is empty")
+		}
 		if seq.Len() > 0 {
 			// Write header directly into bs — no intermediate string
 			bs.WriteByte('>')
diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go
index 845993a..679230c 100644
--- a/pkg/obiformats/fastseq_write_fastq.go
+++ b/pkg/obiformats/fastseq_write_fastq.go
@@ -64,6 +64,9 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch,
 	first := true
 
 	for _, seq := range batch.Slice() {
+		if len(seq.Id()) == 0 {
+			log.Fatalf("Sequence identifier is empty")
+		}
 		if seq.Len() > 0 {
 			_formatFastq(&bs, seq, formater)
 

From cecf90fa40b052c3a0a36dca6ff29b629bac8750 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Thu, 14 May 2026 20:57:05 +0800
Subject: [PATCH 2/5] feat: add min/max filtering and saturating subtraction
 utilities

Introduce generic and reflection-based utilities for filtering slices and maps by minimum/maximum thresholds, along with saturating subtraction. The `obiutils` package provides type-safe generic implementations alongside dynamic reflection dispatchers to handle arbitrary ordered and numeric types. These are exposed as GVAL expression functions in `obiseq`, extending the language's built-in filtering and numeric capabilities.
---
 pkg/obiseq/language.go |  13 +++
 pkg/obiutils/minmax.go | 241 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 254 insertions(+)

diff --git a/pkg/obiseq/language.go b/pkg/obiseq/language.go
index e248b6a..9f0297b 100644
--- a/pkg/obiseq/language.go
+++ b/pkg/obiseq/language.go
@@ -141,6 +141,19 @@ var OBILang = gval.NewLanguage(
 	gval.Function("max", func(args ...interface{}) (interface{}, error) {
 		return obiutils.Max(args[0])
 	}),
+
+	gval.Function("filtermin", func(args ...interface{}) (interface{}, error) {
+		return obiutils.FilterMin(args[0], args[1])
+	}),
+
+	gval.Function("filtermax", func(args ...interface{}) (interface{}, error) {
+		return obiutils.FilterMax(args[0], args[1])
+	}),
+
+	gval.Function("saturatingsub", func(args ...interface{}) (interface{}, error) {
+		return obiutils.SaturatingSub(args[0], args[1])
+	}),
+
 	gval.Function("contains", func(args ...interface{}) (interface{}, error) {
 		if obiutils.IsAMap(args[0]) {
 			val := reflect.ValueOf(args[0]).MapIndex(reflect.ValueOf(args[1]))
diff --git a/pkg/obiutils/minmax.go b/pkg/obiutils/minmax.go
index b01dfc1..1312e06 100644
--- a/pkg/obiutils/minmax.go
+++ b/pkg/obiutils/minmax.go
@@ -34,6 +34,26 @@ func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) {
 	return
 }
 
+func FilterMinSlice[T constraints.Ordered](vec []T, minimum T) []T {
+	kept := make([]T, 0, len(vec)) // at most every element is retained
+	for idx := range vec {
+		if item := vec[idx]; item >= minimum {
+			kept = append(kept, item)
+		}
+	}
+	return kept
+}
+
+func FilterMaxSlice[T constraints.Ordered](vec []T, maximum T) []T {
+	kept := make([]T, 0, len(vec)) // at most every element is retained
+	for idx := range vec {
+		if item := vec[idx]; item <= maximum {
+			kept = append(kept, item)
+		}
+	}
+	return kept
+}
+
 func MaxMap[K comparable, T constraints.Ordered](values map[K]T) (K, T, error) {
 	var maxKey K
 	var maxValue T
@@ -73,6 +93,46 @@ func MinMap[K comparable, T constraints.Ordered](values map[K]T) (K, T, error) {
 	return minKey, minValue, nil
 }
 
+func FilterMinMap[K comparable, T constraints.Ordered](values map[K]T, minimum T) map[K]T {
+	kept := make(map[K]T) // entries whose value is >= minimum
+	for key, val := range values {
+		if val >= minimum {
+			kept[key] = val
+		}
+	}
+	return kept
+}
+
+func FilterMaxMap[K comparable, T constraints.Ordered](values map[K]T, maximum T) map[K]T {
+	kept := make(map[K]T) // entries whose value is <= maximum
+	for key, val := range values {
+		if val <= maximum {
+			kept[key] = val
+		}
+	}
+	return kept
+}
+
+func SaturatingSubSlice[T Numeric](vec []T, sub T) []T {
+	out := make([]T, len(vec)) // zero-filled: positions with v <= sub stay at 0
+	for idx := range vec {
+		if item := vec[idx]; item > sub {
+			out[idx] = item - sub
+		}
+	}
+	return out
+}
+
+func SaturatingSubMap[K comparable, T Numeric](values map[K]T, sub T) map[K]T {
+	out := make(map[K]T) // keys with v <= sub are left out of the result
+	for key, val := range values {
+		if val > sub {
+			out[key] = val - sub
+		}
+	}
+	return out
+}
+
 // Min returns the smallest element in a slice/array or map,
 // or the value itself if data is a single comparable value.
 // Returns an error if the container is empty or the type is unsupported.
@@ -135,6 +195,174 @@ func Max(data interface{}) (interface{}, error) {
 	}
 }
 
+// FilterMin keeps the values of data that are greater than or equal to
+// minimum.
+//
+// Slices and arrays are handed to filterMinFromIterable and maps to
+// filterMinFromMap. Any other value of an ordered kind is returned as is,
+// without being compared with minimum.
+//
+// An error is returned when data is an empty slice, array or map, or when
+// data is neither a container nor a value of an ordered kind.
+func FilterMin(data interface{}, minimum interface{}) (interface{}, error) {
+	v := reflect.ValueOf(data)
+	switch v.Kind() {
+	case reflect.Slice, reflect.Array:
+		if v.Len() == 0 {
+			return nil, errors.New("empty slice or array")
+		}
+		return filterMinFromIterable(v, minimum)
+	case reflect.Map:
+		if v.Len() == 0 {
+			return nil, errors.New("empty map")
+		}
+		return filterMinFromMap(v, minimum)
+	default:
+		if !isOrderedKind(v.Kind()) {
+			return nil, fmt.Errorf("unsupported type: %s", v.Kind())
+		}
+		return data, nil
+	}
+}
+
+// FilterMax keeps the values of data that are lower than or equal to
+// maximum.
+//
+// Slices and arrays are handed to filterMaxFromIterable and maps to
+// filterMaxFromMap. Any other value of an ordered kind is returned as is,
+// without being compared with maximum.
+//
+// An error is returned when data is an empty slice, array or map, or when
+// data is neither a container nor a value of an ordered kind.
+func FilterMax(data interface{}, maximum interface{}) (interface{}, error) {
+	v := reflect.ValueOf(data)
+	switch v.Kind() {
+	case reflect.Slice, reflect.Array:
+		if v.Len() == 0 {
+			return nil, errors.New("empty slice or array")
+		}
+		return filterMaxFromIterable(v, maximum)
+	case reflect.Map:
+		if v.Len() == 0 {
+			return nil, errors.New("empty map")
+		}
+		return filterMaxFromMap(v, maximum)
+	default:
+		if !isOrderedKind(v.Kind()) {
+			return nil, fmt.Errorf("unsupported type: %s", v.Kind())
+		}
+		return data, nil
+	}
+}
+
+// SaturatingSub subtracts sub from data, clamping the result at zero: a
+// value v becomes v-sub when v > sub and 0 otherwise.
+//
+// data may be a slice or an array (a slice of results is returned), a map
+// (a new map of the same type is returned, without the entries whose
+// result is zero) or a single numeric value. sub must be numeric; it does
+// not need to have the same type as the values it is subtracted from.
+func SaturatingSub(data interface{}, sub interface{}) (interface{}, error) {
+	v := reflect.ValueOf(data)
+	switch v.Kind() {
+	case reflect.Slice, reflect.Array:
+		return saturatingSubFromIterable(v, sub)
+	case reflect.Map:
+		return saturatingSubFromMap(v, sub)
+	default:
+		if !isNumericKind(v.Kind()) {
+			return nil, fmt.Errorf("unsupported type: %s", v.Kind())
+		}
+		r, err := saturatingSubValues(v, reflect.ValueOf(sub))
+		if err != nil {
+			return nil, err
+		}
+		return r.Interface(), nil
+	}
+}
+
+// saturatingSubFromIterable applies saturatingSubValues to every element
+// of the slice or array v and returns the results as a slice of the same
+// length.
+func saturatingSubFromIterable(v reflect.Value, sub interface{}) (interface{}, error) {
+	subVal := reflect.ValueOf(sub)
+	// reflect.MakeSlice panics when given an array type, so for arrays the
+	// result type is the slice type built from the element type.
+	resultType := v.Type()
+	if v.Kind() == reflect.Array {
+		resultType = reflect.SliceOf(resultType.Elem())
+	}
+	result := reflect.MakeSlice(resultType, v.Len(), v.Len())
+	for i := 0; i < v.Len(); i++ {
+		r, err := saturatingSubValues(v.Index(i), subVal)
+		if err != nil {
+			return nil, err
+		}
+		result.Index(i).Set(r)
+	}
+	return result.Interface(), nil
+}
+
+// saturatingSubFromMap applies saturatingSubValues to every value of the
+// map v. Keys whose result is zero are left out of the returned map.
+func saturatingSubFromMap(v reflect.Value, sub interface{}) (interface{}, error) {
+	subVal := reflect.ValueOf(sub)
+	result := reflect.MakeMap(v.Type())
+	for _, key := range v.MapKeys() {
+		r, err := saturatingSubValues(v.MapIndex(key), subVal)
+		if err != nil {
+			return nil, err
+		}
+		if !r.IsZero() {
+			result.SetMapIndex(key, r)
+		}
+	}
+	return result.Interface(), nil
+}
+
+// saturatingSubValues returns a-b clamped at zero, as a value of the type
+// of a: a-b when a > b, the zero value otherwise.
+//
+// b may be of any numeric kind. It is converted to the widest type of the
+// family of a (int64, uint64 or float64) before the comparison, so a
+// fractional b is truncated toward zero when a is an integer. Calling
+// b.Int(), b.Uint() or b.Float() directly would panic whenever the kind of
+// b differs from the kind of a.
+func saturatingSubValues(a, b reflect.Value) (reflect.Value, error) {
+	if !isNumericKind(a.Kind()) {
+		return reflect.Value{}, fmt.Errorf("unsupported type for saturating subtraction: %s", a.Kind())
+	}
+	if !b.IsValid() || !isNumericKind(b.Kind()) {
+		return reflect.Value{}, fmt.Errorf("unsupported subtrahend type for saturating subtraction: %s", b.Kind())
+	}
+
+	result := reflect.New(a.Type()).Elem()
+	switch a.Kind() {
+	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
+		if b.CanUint() && b.Uint() > 1<<63-1 {
+			// b exceeds every signed value: the result saturates to zero.
+			return result, nil
+		}
+		if av, bv := a.Int(), b.Convert(reflect.TypeOf(int64(0))).Int(); av > bv {
+			result.SetInt(av - bv)
+		}
+	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
+		if (b.CanInt() && b.Int() < 0) || (b.CanFloat() && b.Float() < 0) {
+			// A negative amount has no unsigned representation; converting
+			// it would wrap around instead of adding to a.
+			return reflect.Value{}, fmt.Errorf("negative subtrahend %v for unsigned saturating subtraction", b)
+		}
+		if av, bv := a.Uint(), b.Convert(reflect.TypeOf(uint64(0))).Uint(); av > bv {
+			result.SetUint(av - bv)
+		}
+	default: // reflect.Float32, reflect.Float64
+		if av, bv := a.Float(), b.Convert(reflect.TypeOf(float64(0))).Float(); av > bv {
+			result.SetFloat(av - bv)
+		}
+	}
+	return result, nil
+}
+
 // maxFromIterable scans a slice/array to find the maximum.
 func maxFromIterable(v reflect.Value) (interface{}, error) {
 	var best reflect.Value
@@ -165,6 +335,102 @@ func minFromIterable(v reflect.Value) (interface{}, error) {
 	return minVal.Interface(), nil
 }
 
+// filterMinFromIterable returns, as a slice, the elements of the slice or
+// array v that are not lower than minimum, in their original order.
+// It fails when minimum or an element of v is not of an ordered kind.
+func filterMinFromIterable(v reflect.Value, minimum interface{}) (interface{}, error) {
+	minVal := reflect.ValueOf(minimum)
+	if !isOrderedKind(minVal.Kind()) {
+		return nil, fmt.Errorf("unsupported threshold type: %s", minVal.Kind())
+	}
+	// reflect.MakeSlice panics when given an array type, so for arrays the
+	// result type is the slice type built from the element type.
+	resultType := v.Type()
+	if v.Kind() == reflect.Array {
+		resultType = reflect.SliceOf(resultType.Elem())
+	}
+	result := reflect.MakeSlice(resultType, 0, v.Len())
+	for i := 0; i < v.Len(); i++ {
+		elem := v.Index(i)
+		if !isOrderedKind(elem.Kind()) {
+			return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
+		}
+		if !less(elem, minVal) { // elem >= minimum
+			result = reflect.Append(result, elem)
+		}
+	}
+	return result.Interface(), nil
+}
+
+// filterMaxFromIterable returns, as a slice, the elements of the slice or
+// array v that are not greater than maximum, in their original order.
+// It fails when maximum or an element of v is not of an ordered kind.
+func filterMaxFromIterable(v reflect.Value, maximum interface{}) (interface{}, error) {
+	maxVal := reflect.ValueOf(maximum)
+	if !isOrderedKind(maxVal.Kind()) {
+		return nil, fmt.Errorf("unsupported threshold type: %s", maxVal.Kind())
+	}
+	// reflect.MakeSlice panics when given an array type, so for arrays the
+	// result type is the slice type built from the element type.
+	resultType := v.Type()
+	if v.Kind() == reflect.Array {
+		resultType = reflect.SliceOf(resultType.Elem())
+	}
+	result := reflect.MakeSlice(resultType, 0, v.Len())
+	for i := 0; i < v.Len(); i++ {
+		elem := v.Index(i)
+		if !isOrderedKind(elem.Kind()) {
+			return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
+		}
+		if !greater(elem, maxVal) { // elem <= maximum
+			result = reflect.Append(result, elem)
+		}
+	}
+	return result.Interface(), nil
+}
+
+// filterMinFromMap returns a new map of the same type as v holding the
+// entries of v whose value is not lower than minimum.
+// It fails when minimum or a value of v is not of an ordered kind.
+func filterMinFromMap(v reflect.Value, minimum interface{}) (interface{}, error) {
+	minVal := reflect.ValueOf(minimum)
+	if !isOrderedKind(minVal.Kind()) {
+		return nil, fmt.Errorf("unsupported threshold type: %s", minVal.Kind())
+	}
+	result := reflect.MakeMap(v.Type())
+	for _, key := range v.MapKeys() {
+		elem := v.MapIndex(key)
+		if !isOrderedKind(elem.Kind()) {
+			return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
+		}
+		if !less(elem, minVal) { // elem >= minimum
+			result.SetMapIndex(key, elem)
+		}
+	}
+	return result.Interface(), nil
+}
+
+// filterMaxFromMap returns a new map of the same type as v holding the
+// entries of v whose value is not greater than maximum.
+// It fails when maximum or a value of v is not of an ordered kind.
+func filterMaxFromMap(v reflect.Value, maximum interface{}) (interface{}, error) {
+	maxVal := reflect.ValueOf(maximum)
+	if !isOrderedKind(maxVal.Kind()) {
+		return nil, fmt.Errorf("unsupported threshold type: %s", maxVal.Kind())
+	}
+	result := reflect.MakeMap(v.Type())
+	for _, key := range v.MapKeys() {
+		elem := v.MapIndex(key)
+		if !isOrderedKind(elem.Kind()) {
+			return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
+		}
+		if !greater(elem, maxVal) { // elem <= maximum
+			result.SetMapIndex(key, elem)
+		}
+	}
+	return result.Interface(), nil
+}
+
 // maxFromMap scans map values to find the maximum.
 func maxFromMap(v reflect.Value) (interface{}, error) {
 	var best reflect.Value
@@ -199,6 +429,17 @@ func minFromMap(v reflect.Value) (interface{}, error) {
 	return minVal.Interface(), nil
 }
 
+func isNumericKind(k reflect.Kind) bool {
+	switch k {
+	case reflect.Float32, reflect.Float64,
+		reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64,
+		reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
+		return true
+	}
+	// Every other kind (complex numbers and uintptr included) is non-numeric.
+	return false
+}
+
 // isOrderedKind reports whether k supports comparison ordering.
 func isOrderedKind(k reflect.Kind) bool {
 	switch k {

From af7ae3d60c8a7a680b392b2b07223ddf2d18d4b7 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Sun, 17 May 2026 14:52:31 +0800
Subject: [PATCH 3/5] Correct Shannon entropy bias for canonical k-mers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multiple raw k-mers collapsing into identical circular canonical forms introduce bias into complexity estimates. This change pre-computes `log(class_size)` tables and per-word-size maximum entropy bounds. The `KmerEntropy` function and `KmerEntropyFilter` are updated to apply the corrected formula `(log(N) + (Σf·log(s) - Σf·log(f))/N) / emax`, where N is the number of sub-words, f the count of a canonical form and s the size of its equivalence class, ensuring accurate sequence complexity estimation.
---
 pkg/obikmer/entropy.go | 145 +++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 55 deletions(-)

diff --git a/pkg/obikmer/entropy.go b/pkg/obikmer/entropy.go
index 94f8ab7..8281460 100644
--- a/pkg/obikmer/entropy.go
+++ b/pkg/obikmer/entropy.go
@@ -4,22 +4,21 @@ import "math"
 
 // KmerEntropy computes the entropy of a single encoded k-mer.
 //
-// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
+// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer
 // to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
 // normalizes them by circular canonical form, counts their frequencies, and
-// computes Shannon entropy normalized by the maximum possible entropy.
+// computes Shannon entropy corrected for class sizes, normalized by the
+// maximum possible entropy over 4^ws raw bins.
 // The returned value is the minimum entropy across all word sizes.
 //
+// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N
+// under-estimates the true complexity when many raw words collapse to the same
+// canonical form.  Adding Σ f·log(class_size)/N recovers the entropy of the
+// underlying uncollapsed distribution (assuming uniform mixing within each
+// equivalence class).
+//
 // A value close to 0 indicates very low complexity (e.g. "AAAA..."),
 // while a value close to 1 indicates high complexity.
-//
-// Parameters:
-//   - kmer: the encoded k-mer (2 bits per base)
-//   - k: the k-mer size
-//   - levelMax: maximum sub-word size for entropy (typically 6)
-//
-// Returns:
-//   - minimum normalized entropy across all word sizes 1..levelMax
 func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 	if k < 1 || levelMax < 1 {
 		return 1.0
@@ -35,7 +34,7 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 	var seqBuf [32]byte
 	seq := DecodeKmer(kmer, k, seqBuf[:])
 
-	// Pre-compute nLogN lookup (same as lowmask)
+	// Pre-compute nLogN lookup
 	nLogN := make([]float64, k+1)
 	for i := 1; i <= k; i++ {
 		nLogN[i] = float64(i) * math.Log(float64(i))
@@ -51,6 +50,23 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 		}
 	}
 
+	// Build ln(class_size) tables: for each canonical form, how many raw
+	// words map to it under circular normalization.
+	classLogSizeTables := make([][]float64, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		tableSize := 1 << (ws * 2)
+		classSize := make([]int, tableSize)
+		for code := 0; code < tableSize; code++ {
+			classSize[normTables[ws][code]]++
+		}
+		classLogSizeTables[ws] = make([]float64, tableSize)
+		for j := 0; j < tableSize; j++ {
+			if classSize[j] > 0 {
+				classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
+			}
+		}
+	}
+
 	minEntropy := math.MaxFloat64
 
 	for ws := 1; ws <= levelMax; ws++ {
@@ -75,23 +91,13 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 			table[normWord]++
 		}
 
-		// Compute Shannon entropy
+		// Compute emax over 4^ws raw bins (uncollapsed distribution).
 		floatNwords := float64(nwords)
 		logNwords := math.Log(floatNwords)
-
-		var sumNLogN float64
-		for j := 0; j < tableSize; j++ {
-			n := table[j]
-			if n > 0 {
-				sumNLogN += nLogN[n]
-			}
-		}
-
-		// Compute emax (maximum possible entropy for this word size)
-		na := CanonicalCircularKmerCount(ws)
+		na := tableSize // 4^ws
 		var emax float64
 		if nwords < na {
-			emax = math.Log(float64(nwords))
+			emax = logNwords
 		} else {
 			cov := nwords / na
 			remains := nwords - (na * cov)
@@ -105,7 +111,19 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 			continue
 		}
 
-		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		// Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms.
+		classLogSize := classLogSizeTables[ws]
+		var sumNLogN, sumClassLogN float64
+		for j := 0; j < tableSize; j++ {
+			n := table[j]
+			if n > 0 {
+				sumNLogN += nLogN[n]
+				sumClassLogN += float64(n) * classLogSize[j]
+			}
+		}
+
+		// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
+		entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
 		if entropy < 0 {
 			entropy = 0
 		}
@@ -129,24 +147,20 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 // IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
 // Each goroutine must create its own instance via NewKmerEntropyFilter.
 type KmerEntropyFilter struct {
-	k          int
-	levelMax   int
-	threshold  float64
-	nLogN      []float64
-	normTables [][]int
-	emaxValues []float64
-	logNwords  []float64
+	k                  int
+	levelMax           int
+	threshold          float64
+	nLogN              []float64
+	normTables         [][]int
+	classLogSizeTables [][]float64
+	emaxValues         []float64
+	logNwords          []float64
 	// Pre-allocated frequency tables reused across Entropy() calls.
 	// One per word size (index 0 unused). Reset to zero before each use.
 	freqTables [][]int
 }
 
 // NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
-//
-// Parameters:
-//   - k: the k-mer size
-//   - levelMax: maximum sub-word size for entropy (typically 6)
-//   - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
 func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
 	if levelMax >= k {
 		levelMax = k - 1
@@ -169,20 +183,38 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter
 		}
 	}
 
+	// ln(class_size) for each canonical form under circular normalization.
+	classLogSizeTables := make([][]float64, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		tableSize := 1 << (ws * 2)
+		classSize := make([]int, tableSize)
+		for code := 0; code < tableSize; code++ {
+			classSize[normTables[ws][code]]++
+		}
+		classLogSizeTables[ws] = make([]float64, tableSize)
+		for j := 0; j < tableSize; j++ {
+			if classSize[j] > 0 {
+				classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
+			}
+		}
+	}
+
+	// Pre-compute emax and logNwords per word size.
+	// emax uses 4^ws raw bins to match the corrected entropy.
 	emaxValues := make([]float64, levelMax+1)
 	logNwords := make([]float64, levelMax+1)
 	for ws := 1; ws <= levelMax; ws++ {
 		nw := k - ws + 1
-		na := CanonicalCircularKmerCount(ws)
+		na := 1 << (ws * 2) // 4^ws raw bins
+		floatNw := float64(nw)
+		logNwords[ws] = math.Log(floatNw)
 		if nw < na {
-			logNwords[ws] = math.Log(float64(nw))
-			emaxValues[ws] = math.Log(float64(nw))
+			emaxValues[ws] = logNwords[ws]
 		} else {
 			cov := nw / na
 			remains := nw - (na * cov)
-			f1 := float64(cov) / float64(nw)
-			f2 := float64(cov+1) / float64(nw)
-			logNwords[ws] = math.Log(float64(nw))
+			f1 := float64(cov) / floatNw
+			f2 := float64(cov+1) / floatNw
 			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
 				float64(remains)*f2*math.Log(f2))
 		}
@@ -195,14 +227,15 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter
 	}
 
 	return &KmerEntropyFilter{
-		k:          k,
-		levelMax:   levelMax,
-		threshold:  threshold,
-		nLogN:      nLogN,
-		normTables: normTables,
-		emaxValues: emaxValues,
-		logNwords:  logNwords,
-		freqTables: freqTables,
+		k:                  k,
+		levelMax:           levelMax,
+		threshold:          threshold,
+		nLogN:              nLogN,
+		normTables:         normTables,
+		classLogSizeTables: classLogSizeTables,
+		emaxValues:         emaxValues,
+		logNwords:          logNwords,
+		freqTables:         freqTables,
 	}
 }
 
@@ -236,7 +269,7 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
 		// Count circular-canonical sub-word frequencies
 		tableSize := 1 << (ws * 2)
 		table := ef.freqTables[ws]
-		clear(table) // reset to zero
+		clear(table)
 		mask := (1 << (ws * 2)) - 1
 		normTable := ef.normTables[ws]
 
@@ -251,19 +284,21 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
 			table[normWord]++
 		}
 
-		// Compute Shannon entropy
 		floatNwords := float64(nwords)
 		logNwords := ef.logNwords[ws]
+		classLogSize := ef.classLogSizeTables[ws]
 
-		var sumNLogN float64
+		var sumNLogN, sumClassLogN float64
 		for j := 0; j < tableSize; j++ {
 			n := table[j]
 			if n > 0 {
 				sumNLogN += ef.nLogN[n]
+				sumClassLogN += float64(n) * classLogSize[j]
 			}
 		}
 
-		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
+		entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
 		if entropy < 0 {
 			entropy = 0
 		}

From dcdaf9e372ccca750b3b38f19cec2b76dc38c3bd Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Mon, 1 Jun 2026 13:18:44 +0200
Subject: [PATCH 4/5] feat: support map and slice types in OBI attributes

Extends OBI header parsing to recognize and deserialize JSON-like arrays and objects. Introduces safe conversion utilities in `obiutils` to cast generic interface values into typed maps, and exposes them via new `BioSequence` methods. Header values are now marshaled, quote-normalized, and formatted for map and slice types.
---
 pkg/obiformats/fastseq_obi_header.go | 99 ++++++++++++++++++++++++----
 pkg/obiseq/attributes.go             | 18 +++++
 pkg/obiseq/biosequence_test.go       |  2 +-
 pkg/obiutils/cast_interface.go       | 38 +++++++++++
 4 files changed, 144 insertions(+), 13 deletions(-)

diff --git a/pkg/obiformats/fastseq_obi_header.go b/pkg/obiformats/fastseq_obi_header.go
index 79a45b1..fb2fb78 100644
--- a/pkg/obiformats/fastseq_obi_header.go
+++ b/pkg/obiformats/fastseq_obi_header.go
@@ -146,6 +146,65 @@ func __match__key__(text []byte) []int {
 	return []int{} // Not a key
 }
 
+func __match__array__(text []byte) []int { // returns {index of '[', index just past ';'} or an empty slice when text does not start with a bracketed value followed by ';'
+
+	state := 0          // 0: before the opening '[', 1: inside the array, 2: after the closing bracket
+	level := 0          // nesting depth of '[' and '{' currently open
+	start := 0          // index of the opening '['
+	instring := byte(0) // quote character of the string being scanned, 0 outside strings
+
+	for i, r := range text {
+		if state == 2 { // only blanks may separate the closing bracket from ';'
+			if r == ';' {
+				return []int{start, i + 1}
+			}
+			if r != ' ' && r != '\t' {
+				return []int{}
+			}
+		}
+
+		if state == 0 { // only blanks may precede the opening '['
+			if r == '[' {
+				level++
+				state++
+				start = i
+				continue
+			}
+			if r != ' ' && r != '\t' {
+				return []int{}
+			}
+			continue
+		}
+
+		// state == 1: inside the array
+		if instring != 0 { // inside a quoted string: brackets are ignored, a backslash does not escape the quote
+			if r == instring {
+				instring = 0
+			}
+			continue
+		}
+
+		if r == '"' || r == '\'' {
+			instring = r
+			continue
+		}
+
+		if r == '[' || r == '{' {
+			level++
+			continue
+		}
+
+		if r == ']' || r == '}' { // a closer is not checked against the kind of its opener
+			level--
+			if level == 0 {
+				state++
+			}
+		}
+	}
+
+	return []int{} // text ended before the terminating ';' was seen
+}
+
 func __match__general__(text []byte) []int {
 
 	for i, r := range text {
@@ -242,6 +301,21 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
 					stop = m[1] + 1
 				} else {
 
+					// array value
+					m = __match__array__(part)
+					if len(m) > 0 {
+						bvalue = bytes.TrimSpace(part[m[0]:(m[1] - 1)])
+						j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`))
+						j = __obi_header_map_int_key__.ReplaceAll(j, []byte(`$1"$2":`))
+						arr, err := _parse_json_array_interface(j)
+						if err != nil {
+							value = string(bvalue)
+						} else {
+							value = arr
+						}
+						stop = m[1] + 1
+					} else {
+
 					// Generic value
 
 					// m = __obi_header_value_general_pattern__.FindIndex(part)
@@ -264,6 +338,7 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
 						// no value
 						break
 					} // End of No value
+					} // End of not array
 				} // End of not dict
 			} // End of not string
 		} // End of not numeric
@@ -327,19 +402,19 @@ func WriteFastSeqOBIHeade(buffer *bytes.Buffer, sequence *obiseq.BioSequence) {
 					buffer.WriteString(fmt.Sprintf("%s=", key))
 					buffer.Write(tv)
 					buffer.WriteString("; ")
-				case map[string]int,
-					map[string]string,
-					map[string]interface{}:
-					tv, err := obiutils.JsonMarshal(t)
-					if err != nil {
-						log.Fatalf("Cannot convert %v value", value)
-					}
-					tv = bytes.ReplaceAll(tv, []byte(`"`), []byte("'"))
-					buffer.WriteString(fmt.Sprintf("%s=", key))
-					buffer.Write(tv)
-					buffer.WriteString("; ")
 				default:
-					buffer.WriteString(fmt.Sprintf("%s=%v; ", key, value))
+					if obiutils.IsAMap(value) || obiutils.IsASlice(value) || obiutils.IsAnArray(value) {
+						tv, err := obiutils.JsonMarshal(t)
+						if err != nil {
+							log.Fatalf("Cannot convert %v value", value)
+						}
+						tv = bytes.ReplaceAll(tv, []byte(`"`), []byte("'"))
+						buffer.WriteString(fmt.Sprintf("%s=", key))
+						buffer.Write(tv)
+						buffer.WriteString("; ")
+					} else {
+						buffer.WriteString(fmt.Sprintf("%s=%v; ", key, value))
+					}
 				}
 			}
 		}
diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go
index dd4699f..d4d6884 100644
--- a/pkg/obiseq/attributes.go
+++ b/pkg/obiseq/attributes.go
@@ -364,6 +364,40 @@ func (s *BioSequence) GetIntSlice(key string) ([]int, bool) {
 	return val, ok
 }
 
+// GetMapOfIntSlice returns the attribute key as a map[string][]int.
+//
+// The boolean is false, and the map nil, when the attribute does not exist
+// or cannot be converted by obiutils.InterfaceToMapOfIntSlice.
+func (s *BioSequence) GetMapOfIntSlice(key string) (map[string][]int, bool) {
+	v, ok := s.GetAttribute(key)
+	if !ok {
+		return nil, false
+	}
+	val, err := obiutils.InterfaceToMapOfIntSlice(v)
+	if err != nil {
+		// Never expose a partially converted map alongside ok == false.
+		return nil, false
+	}
+	return val, true
+}
+
+// GetMapOfStringSlice returns the attribute key as a map[string][]string.
+//
+// The boolean is false, and the map nil, when the attribute does not exist
+// or cannot be converted by obiutils.InterfaceToMapOfStringSlice.
+func (s *BioSequence) GetMapOfStringSlice(key string) (map[string][]string, bool) {
+	v, ok := s.GetAttribute(key)
+	if !ok {
+		return nil, false
+	}
+	val, err := obiutils.InterfaceToMapOfStringSlice(v)
+	if err != nil {
+		// Never expose a partially converted map alongside ok == false.
+		return nil, false
+	}
+	return val, true
+}
+
 // Count returns the value of the "count" attribute of the BioSequence.
 //
 // The count of a sequence is the number of times it has been observed in the dataset.
diff --git a/pkg/obiseq/biosequence_test.go b/pkg/obiseq/biosequence_test.go
index 5e51203..8aae1d6 100644
--- a/pkg/obiseq/biosequence_test.go
+++ b/pkg/obiseq/biosequence_test.go
@@ -103,7 +103,7 @@ func TestNewBioSequence(t *testing.T) {
 // Return type: None.
 func TestNewBioSequenceWithQualities(t *testing.T) {
 	id := "123"
-	sequence := []byte("ATGC")
+	sequence := []byte("atgc")
 	definition := "DNA sequence"
 	qualities := []byte("1234")
 
diff --git a/pkg/obiutils/cast_interface.go b/pkg/obiutils/cast_interface.go
index 0218ffd..b82e4fe 100644
--- a/pkg/obiutils/cast_interface.go
+++ b/pkg/obiutils/cast_interface.go
@@ -276,6 +276,56 @@ func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
 	return
 }
 
+// InterfaceToMapOfIntSlice converts i into a map[string][]int.
+//
+// A map[string][]int is returned as is. A map[string]interface{} is
+// converted value by value with InterfaceToIntSlice; the first value that
+// cannot be converted aborts the conversion and its error is returned with
+// a nil map. Any other type yields a NotAMapInt error.
+func InterfaceToMapOfIntSlice(i interface{}) (val map[string][]int, err error) {
+	switch m := i.(type) {
+	case map[string][]int:
+		val = m
+	case map[string]interface{}:
+		val = make(map[string][]int, len(m))
+		for k, v := range m {
+			val[k], err = InterfaceToIntSlice(v)
+			if err != nil {
+				// Do not hand a half-converted map back to the caller.
+				return nil, err
+			}
+		}
+	default:
+		err = &NotAMapInt{"value attribute cannot be casted to a map[string][]int"}
+	}
+	return
+}
+
+// InterfaceToMapOfStringSlice converts i into a map[string][]string.
+//
+// A map[string][]string is returned as is. A map[string]interface{} is
+// converted value by value with InterfaceToStringSlice; the first value
+// that cannot be converted aborts the conversion and its error is returned
+// with a nil map. Any other type yields a NotAMapInt error.
+func InterfaceToMapOfStringSlice(i interface{}) (val map[string][]string, err error) {
+	switch m := i.(type) {
+	case map[string][]string:
+		val = m
+	case map[string]interface{}:
+		val = make(map[string][]string, len(m))
+		for k, v := range m {
+			val[k], err = InterfaceToStringSlice(v)
+			if err != nil {
+				// Do not hand a half-converted map back to the caller.
+				return nil, err
+			}
+		}
+	default:
+		err = &NotAMapInt{"value attribute cannot be casted to a map[string][]string"}
+	}
+	return
+}
+
 func InterfaceToStringSlice(i interface{}) (val []string, err error) {
 	err = nil
 

From 930fe5f1ba5c0db974d76542e21d52cea8c98661 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Mon, 1 Jun 2026 13:22:58 +0200
Subject: [PATCH 5/5] Release 4.4.43

---
 pkg/obioptions/version.go | 2 +-
 version.txt               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go
index ef7733c..ded8d4f 100644
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -3,7 +3,7 @@ package obioptions
 // Version is automatically updated by the Makefile from version.txt
 // The patch number (third digit) is incremented on each push to the repository
 
-var _Version = "Release 4.4.42"
+var _Version = "Release 4.4.43"
 
 // Version returns the version of the obitools package.
 //
diff --git a/version.txt b/version.txt
index cdc0aa1..9b51232 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-4.4.42
+4.4.43