From 373464cb0623a80ef906049487be63432ca20772 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Fri, 30 Aug 2024 11:17:33 +0200
Subject: [PATCH] On development genome skim tools

---
 cmd/obitools/obikmersimcount/main.go  |  6 +--
 pkg/obialign/readalign.go             | 18 +++++--
 pkg/obifp/uint128.go                  |  2 +-
 pkg/obifp/uint64.go                   | 10 +++-
 pkg/obikmer/encodefourmer.go          | 24 +++++++--
 pkg/obikmer/kmermap.go                | 75 +++++++++++++++++++++------
 pkg/obioptions/options.go             |  4 +-
 pkg/obioptions/version.go             |  2 +-
 pkg/obitools/obikmersim/obikmersim.go | 44 ++++++++++------
 pkg/obitools/obikmersim/options.go    |  9 ++++
 10 files changed, 144 insertions(+), 50 deletions(-)

diff --git a/cmd/obitools/obikmersimcount/main.go b/cmd/obitools/obikmersimcount/main.go
index a0496c1..d155ef8 100644
--- a/cmd/obitools/obikmersimcount/main.go
+++ b/cmd/obitools/obikmersimcount/main.go
@@ -44,10 +44,10 @@ func main() {
 
 	obiconvert.OpenSequenceDataErrorMessage(args, err)
 
-	selected := obikmersim.CLILookForSharedKmers(sequences)
-	topull, err := obiconvert.CLIWriteBioSequences(selected, false)
+	counted := obikmersim.CLILookForSharedKmers(sequences)
+	topull, err := obiconvert.CLIWriteBioSequences(counted, false)
 
-	if err == nil {
+	if err != nil {
 		log.Panic(err)
 	}
 
diff --git a/pkg/obialign/readalign.go b/pkg/obialign/readalign.go
index 10a2e65..4c1136a 100644
--- a/pkg/obialign/readalign.go
+++ b/pkg/obialign/readalign.go
@@ -43,16 +43,25 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence,
 		directAlignment = false
 	}
 
-	if shift > 0 {
+	// Compute the overlapping region length
+	switch {
+	case shift > 0:
 		over = seqA.Len() - shift
-	} else {
+	case shift < 0:
 		over = seqB.Len() + shift
+	default:
+		over = min(seqA.Len(), seqB.Len())
 	}
 
+	// log.Warnf("fw/fw: %v shift=%d fastCount=%d/over=%d fastScore=%f",
+	// 	directAlignment, shift, fastCount, over, fastScore)
+
+	// log.Warnf(("seqA: %s\nseqB: %s\n"), seqA.String(), seqB.String())
+
 	// At least one mismatch exists in the overlaping region
 	if fastCount+3 < over {
 
-		if shift > 0 {
+		if shift > 0 || (shift == 0 && seqB.Len() >= seqA.Len()) {
 			startA = shift - delta
 			if startA < 0 {
 				startA = 0
@@ -105,7 +114,7 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence,
 
 		// Both overlaping regions are identicals
 
-		if shift > 0 {
+		if shift > 0 || (shift == 0 && seqB.Len() >= seqA.Len()) {
 			startA = shift
 			startB = 0
 			extra5 = -startA
@@ -123,6 +132,7 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence,
 			extra3 = partLen - seqA.Len()
 			qualSeqA = seqA.Qualities()[:partLen]
 		}
+
 		score = 0
 		for i, qualA := range qualSeqA {
 			qualB := qualSeqB[i]
diff --git a/pkg/obifp/uint128.go b/pkg/obifp/uint128.go
index 0ecdfc5..2c62928 100644
--- a/pkg/obifp/uint128.go
+++ b/pkg/obifp/uint128.go
@@ -145,7 +145,7 @@ func (u Uint128) Mul(v Uint128) Uint128 {
 	p2, p3 := bits.Mul64(u.w0, v.w1)
 	hi, c0 := bits.Add64(hi, p1, 0)
 	hi, c1 := bits.Add64(hi, p3, c0)
-	if (u.w1 != 0 && v.w1 != 0) || p0 != 0 || p2 != 0 || c1 != 0 {
+	if p0 != 0 || p2 != 0 || c1 != 0 {
 		log.Panicf("Uint128 overflow at Mul(%v, %v)", u, v)
 	}
 	return Uint128{w1: hi, w0: lo}
diff --git a/pkg/obifp/uint64.go b/pkg/obifp/uint64.go
index b74e088..fd46624 100644
--- a/pkg/obifp/uint64.go
+++ b/pkg/obifp/uint64.go
@@ -94,11 +94,14 @@ func (u Uint64) LeftShift64(n uint, carryIn uint64) (value, carry uint64) {
 
 	case n == 64:
 		return carryIn, u.w0
+
+	case n < 128:
+		return carryIn, u.w0 << (n - 64)
+
 	}
 
 	log.Warnf("Uint64 overflow at LeftShift64(%v, %v)", u, n)
 	return 0, 0
-
 }
 
 // RightShift64 performs a right shift operation on the Uint64 value by n bits, with carry-out to carry.
@@ -117,10 +120,13 @@ func (u Uint64) RightShift64(n uint, carryIn uint64) (value, carry uint64) {
 		return u.w0, 0
 
 	case n < 64:
-		return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (n - 64)
+		return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (64 - n)
 
 	case n == 64:
 		return carryIn, u.w0
+
+	case n < 128:
+		return carryIn, u.w0 >> (n - 64)
 	}
 
 	log.Warnf("Uint64 overflow at RightShift64(%v, %v)", u, n)
diff --git a/pkg/obikmer/encodefourmer.go b/pkg/obikmer/encodefourmer.go
index 4196cf3..c3be879 100644
--- a/pkg/obikmer/encodefourmer.go
+++ b/pkg/obikmer/encodefourmer.go
@@ -97,8 +97,19 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
 }
 
 // FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
-// The returned values are two integer values. The shift between both the sequences and the count of
-// matching 4mer when this shift is applied between both the sequences.
+//
+// Parameters:
+// - index: A precomputed index of 4mer positions in a reference sequence.
+// - shifts: A map to store the shift and count of matching 4mers.
+// - lindex: The length of the indexed reference sequence.
+// - seq: The sequence to be compared with the reference sequence.
+// - relscore: A boolean indicating whether to calculate the relative score.
+// - buffer: A byte buffer for encoding the sequence.
+//
+// Return type:
+// - int: The shift between the two sequences with the maximum score.
+// - int: The count of matching 4mers at the maximum score.
+// - float64: The maximum score.
 func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) {
 
 	iternal_buffer := Encode4mer(seq, buffer)
@@ -126,12 +137,15 @@ func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obise
 		score := float64(count)
 		if relscore {
 			over := -shift
-			if shift > 0 {
+			switch {
+			case shift > 0:
 				over += lindex
-			} else {
+			case shift < 0:
 				over = seq.Len() - over
+			default:
+				over = min(lindex, seq.Len())
 			}
-			score = score / float64(over)
+			score = score / float64(over-3)
 		}
 		if score > maxscore {
 			maxshift = shift
diff --git a/pkg/obikmer/kmermap.go b/pkg/obikmer/kmermap.go
index a2ed037..f154683 100644
--- a/pkg/obikmer/kmermap.go
+++ b/pkg/obikmer/kmermap.go
@@ -2,6 +2,8 @@ package obikmer
 
 import (
 	"os"
+	"sort"
+	"unsafe"
 
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
@@ -55,7 +57,12 @@ func (k *KmerMap[T]) KmerAsString(kmer T) string {
 
 func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
 
+	if sequence.Len() < int(k.Kmersize) {
+		return nil
+	}
+
 	makeSparseAt := func(kmer T) T {
+
 		if k.SparseAt == -1 {
 			return kmer
 		}
@@ -65,10 +72,8 @@ func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T
 
 	normalizedKmer := func(fw, rv T) T {
 
-		if k.SparseAt >= 0 {
-			fw = makeSparseAt(fw)
-			rv = makeSparseAt(rv)
-		}
+		fw = makeSparseAt(fw)
+		rv = makeSparseAt(rv)
 
 		if fw.LessThan(rv) {
 			return fw
@@ -118,35 +123,62 @@ func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T
 			rep = append(rep, kmer)
 			size--
 		}
+
 	}
 
 	return rep
 }
 
-func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence) {
+func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence, maxocc ...int) {
+	maxoccurs := -1
+	if len(maxocc) > 0 {
+		maxoccurs = maxocc[0]
+	}
 	kmers := k.NormalizedKmerSlice(sequence, nil)
 	for _, kmer := range kmers {
-		k.index[kmer] = append(k.index[kmer], sequence)
+		seqs := k.index[kmer]
+		if maxoccurs == -1 || len(seqs) <= maxoccurs {
+			seqs = append(seqs, sequence)
+			k.index[kmer] = seqs
+		}
 	}
 }
 
 func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
 	kmers := k.NormalizedKmerSlice(sequence, nil)
+	seqs := make([]*obiseq.BioSequence, 0)
 	rep := make(KmerMatch)
 
 	for _, kmer := range kmers {
-		if _, ok := k.index[kmer]; ok {
-			for _, seq := range k.index[kmer] {
-				if seq != sequence {
-					if _, ok := rep[seq]; !ok {
-						rep[seq] = 0
-					}
-					rep[seq]++
-				}
-			}
+		if candidates, ok := k.index[kmer]; ok {
+			seqs = append(seqs, candidates...)
 		}
 	}
 
+	sort.Slice(seqs,
+		func(i, j int) bool {
+			return uintptr(unsafe.Pointer(seqs[i])) < uintptr(unsafe.Pointer(seqs[j]))
+		})
+
+	prevseq := (*obiseq.BioSequence)(nil)
+	n := 0
+	for _, seq := range seqs {
+
+		if seq != prevseq {
+			if prevseq != nil && prevseq != sequence {
+				rep[prevseq] = n
+			}
+			n = 1
+			prevseq = seq
+		}
+
+		n++
+	}
+
+	if prevseq != nil {
+		rep[prevseq] = n
+	}
+
 	return rep
 }
 
@@ -187,7 +219,8 @@ func (k *KmerMatch) Max() *obiseq.BioSequence {
 func NewKmerMap[T obifp.FPUint[T]](
 	sequences obiseq.BioSequenceSlice,
 	kmersize uint,
-	sparse bool) *KmerMap[T] {
+	sparse bool,
+	maxoccurs int) *KmerMap[T] {
 	idx := make(map[T][]*obiseq.BioSequence)
 
 	sparseAt := -1
@@ -246,11 +279,19 @@ func NewKmerMap[T obifp.FPUint[T]](
 	bar := progressbar.NewOptions(n, pbopt...)
 
 	for i, sequence := range sequences {
-		kmap.Push(sequence)
+		kmap.Push(sequence, maxoccurs)
 		if i%100 == 0 {
 			bar.Add(100)
 		}
 	}
 
+	if maxoccurs >= 0 {
+		for k, s := range kmap.index {
+			if len(s) >= maxoccurs {
+				delete(kmap.index, k)
+			}
+		}
+	}
+
 	return kmap
 }
diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go
index 773dcf3..e8b45e8 100644
--- a/pkg/obioptions/options.go
+++ b/pkg/obioptions/options.go
@@ -14,8 +14,8 @@ import (
 )
 
 var _Debug = false
-var _WorkerPerCore = 2.0
-var _ReadWorkerPerCore = 0.5
+var _WorkerPerCore = 1.0
+var _ReadWorkerPerCore = 0.25
 var _WriteWorkerPerCore = 0.25
 var _StrictReadWorker = 0
 var _StrictWriteWorker = 0
diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go
index 2f26c42..dc37a88 100644
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -7,7 +7,7 @@ import (
 // TODO: The version number is extracted from git. This induces that the version
 // corresponds to the last commit, and not the one when the file will be
 // commited
-var _Commit = "bdb96dd"
+var _Commit = "31bfc88"
 var _Version = "Release 4.2.0"
 
 // Version returns the version of the obitools package.
diff --git a/pkg/obitools/obikmersim/obikmersim.go b/pkg/obitools/obikmersim/obikmersim.go
index 06c02e7..b0b4a7c 100644
--- a/pkg/obitools/obikmersim/obikmersim.go
+++ b/pkg/obitools/obikmersim/obikmersim.go
@@ -27,6 +27,7 @@ func MakeCountMatchWorker[T obifp.FPUint[T]](k *obikmer.KmerMap[T], minKmerCount
 		sequence.SetAttribute("obikmer_match_count", n)
 		sequence.SetAttribute("obikmer_kmer_size", k.Kmersize)
 		sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0)
+
 		return obiseq.BioSequenceSlice{sequence}, nil
 	}
 }
@@ -46,17 +47,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]](
 		slice := obiseq.NewBioSequenceSlice(matches.Len())
 		*slice = (*slice)[:0]
 
-		for _, seq := range matches.Sequences() {
+		candidates := matches.Sequences()
+		n := candidates.Len()
+		for _, seq := range candidates {
 			idmatched_id := seq.Id()
 
-			score, path, fastcount, over, fastscore, reverse := obialign.ReadAlign(
+			score, path, fastcount, over, fastscore, directAlignment := obialign.ReadAlign(
 				sequence, seq,
 				gap, scale, delta,
 				fastScoreRel,
 				arena, &shifts,
 			)
 
-			if reverse {
+			if !directAlignment {
 				idmatched_id = idmatched_id + "-rev"
 				seq = seq.ReverseComplement(false)
 			}
@@ -75,17 +78,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]](
 				identity = 0
 			}
 
-			rep := sequence.Copy()
+			rep := cons
 
+			rep.SetAttribute("obikmer_match_count", n)
 			rep.SetAttribute("obikmer_match_id", idmatched_id)
 			rep.SetAttribute("obikmer_fast_count", fastcount)
 			rep.SetAttribute("obikmer_fast_overlap", over)
 			rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000)
+			rep.SetAttribute("seq_length", cons.Len())
 
-			if reverse {
-				rep.SetAttribute("obikmer_orientation", "reverse")
-			} else {
+			if directAlignment {
 				rep.SetAttribute("obikmer_orientation", "forward")
+			} else {
+				rep.SetAttribute("obikmer_orientation", "reverse")
 			}
 
 			if aliLength >= int(k.KmerSize()) && identity >= minIdentity {
@@ -137,16 +142,21 @@ func CLILookForSharedKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
 		iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize())
 	}
 
-	kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode())
+	if CLISelf() {
+		iterator = iterator.Speed("Counting similar reads", references.Len())
+	} else {
+		iterator = iterator.Speed("Counting similar reads")
+	}
+
+	kmerMatch := obikmer.NewKmerMap[obifp.Uint128](
+		references,
+		uint(CLIKmerSize()),
+		CLISparseMode(),
+		CLIMaxKmerOccurs())
+
 	worker := MakeCountMatchWorker(kmerMatch, CLIMinSharedKmers())
 	newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
 
-	if CLISelf() {
-		newIter = newIter.Speed("Counting similar reads", references.Len())
-	} else {
-		newIter = newIter.Speed("Counting similar reads")
-	}
-
 	return newIter.FilterEmpty()
 }
 
@@ -164,7 +174,11 @@ func CLIAlignSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
 	} else {
 		iterator = iterator.Speed("Aligning reads")
 	}
-	kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode())
+	kmerMatch := obikmer.NewKmerMap[obifp.Uint128](
+		references,
+		uint(CLIKmerSize()),
+		CLISparseMode(),
+		CLIMaxKmerOccurs())
 	worker := MakeKmerAlignWorker(kmerMatch, CLIMinSharedKmers(), CLIGap(), CLIScale(), CLIDelta(), CLIFastRelativeScore(), 0.8, true)
 	newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
 
diff --git a/pkg/obitools/obikmersim/options.go b/pkg/obitools/obikmersim/options.go
index fe6bd1d..de36922 100644
--- a/pkg/obitools/obikmersim/options.go
+++ b/pkg/obitools/obikmersim/options.go
@@ -18,6 +18,7 @@ var _Delta = 5
 var _PenalityScale = 1.0
 var _GapPenality = 2.0
 var _FastScoreAbs = false
+var _KmerMaxOccur = -1
 
 // PCROptionSet defines every options related to a simulated PCR.
 //
@@ -46,6 +47,10 @@ func KmerSimCountOptionSet(options *getoptions.GetOpt) {
 		options.Alias("m"),
 		options.Description("Minimum number of shared kmers between two sequences."))
 
+	options.IntVar(&_KmerMaxOccur, "max-kmers", _KmerMaxOccur,
+		options.Alias("M"),
+		options.Description("Maximum number of occurrence of a kmer."))
+
 	options.BoolVar(&_Self, "self", _Self,
 		options.Alias("s"),
 		options.Description("Compare references with themselves."))
@@ -138,3 +143,7 @@ func CLIGap() float64 {
 func CLIFastRelativeScore() bool {
 	return !_FastScoreAbs
 }
+
+func CLIMaxKmerOccurs() int {
+	return _KmerMaxOccur
+}