From 373464cb0623a80ef906049487be63432ca20772 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 30 Aug 2024 11:17:33 +0200 Subject: [PATCH] On development genome skim tools --- cmd/obitools/obikmersimcount/main.go | 6 +-- pkg/obialign/readalign.go | 18 +++++-- pkg/obifp/uint128.go | 2 +- pkg/obifp/uint64.go | 10 +++- pkg/obikmer/encodefourmer.go | 24 +++++++-- pkg/obikmer/kmermap.go | 75 +++++++++++++++++++++------ pkg/obioptions/options.go | 4 +- pkg/obioptions/version.go | 2 +- pkg/obitools/obikmersim/obikmersim.go | 44 ++++++++++------ pkg/obitools/obikmersim/options.go | 9 ++++ 10 files changed, 144 insertions(+), 50 deletions(-) diff --git a/cmd/obitools/obikmersimcount/main.go b/cmd/obitools/obikmersimcount/main.go index a0496c1..d155ef8 100644 --- a/cmd/obitools/obikmersimcount/main.go +++ b/cmd/obitools/obikmersimcount/main.go @@ -44,10 +44,10 @@ func main() { obiconvert.OpenSequenceDataErrorMessage(args, err) - selected := obikmersim.CLILookForSharedKmers(sequences) - topull, err := obiconvert.CLIWriteBioSequences(selected, false) + counted := obikmersim.CLILookForSharedKmers(sequences) + topull, err := obiconvert.CLIWriteBioSequences(counted, false) - if err == nil { + if err != nil { log.Panic(err) } diff --git a/pkg/obialign/readalign.go b/pkg/obialign/readalign.go index 10a2e65..4c1136a 100644 --- a/pkg/obialign/readalign.go +++ b/pkg/obialign/readalign.go @@ -43,16 +43,25 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence, directAlignment = false } - if shift > 0 { + // Compute the overlapping region length + switch { + case shift > 0: over = seqA.Len() - shift - } else { + case shift < 0: over = seqB.Len() + shift + default: + over = min(seqA.Len(), seqB.Len()) } + // log.Warnf("fw/fw: %v shift=%d fastCount=%d/over=%d fastScore=%f", + // directAlignment, shift, fastCount, over, fastScore) + + // log.Warnf(("seqA: %s\nseqB: %s\n"), seqA.String(), seqB.String()) + // At least one mismatch exists in the overlaping region if fastCount+3 < over { - if shift > 0 { + if shift > 0 || (shift == 0 && seqB.Len() >= seqA.Len()) { startA = shift - delta if startA < 0 { startA = 0 @@ -105,7 +114,7 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence, // Both overlaping regions are identicals - if shift > 0 { + if shift > 0 || (shift == 0 && seqB.Len() >= seqA.Len()) { startA = shift startB = 0 extra5 = -startA @@ -123,6 +132,7 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence, extra3 = partLen - seqA.Len() qualSeqA = seqA.Qualities()[:partLen] } + score = 0 for i, qualA := range qualSeqA { qualB := qualSeqB[i] diff --git a/pkg/obifp/uint128.go b/pkg/obifp/uint128.go index 0ecdfc5..2c62928 100644 --- a/pkg/obifp/uint128.go +++ b/pkg/obifp/uint128.go @@ -145,7 +145,7 @@ func (u Uint128) Mul(v Uint128) Uint128 { p2, p3 := bits.Mul64(u.w0, v.w1) hi, c0 := bits.Add64(hi, p1, 0) hi, c1 := bits.Add64(hi, p3, c0) - if (u.w1 != 0 && v.w1 != 0) || p0 != 0 || p2 != 0 || c1 != 0 { + if p0 != 0 || p2 != 0 || c1 != 0 { log.Panicf("Uint128 overflow at Mul(%v, %v)", u, v) } return Uint128{w1: hi, w0: lo} diff --git a/pkg/obifp/uint64.go b/pkg/obifp/uint64.go index b74e088..fd46624 100644 --- a/pkg/obifp/uint64.go +++ b/pkg/obifp/uint64.go @@ -94,11 +94,14 @@ func (u Uint64) LeftShift64(n uint, carryIn uint64) (value, carry uint64) { case n == 64: return carryIn, u.w0 + + case n < 128: + return carryIn, u.w0 << (n - 64) + } log.Warnf("Uint64 overflow at LeftShift64(%v, %v)", u, n) return 0, 0 - } // RightShift64 performs a right shift operation on the Uint64 value by n bits, with carry-out to carry. @@ -117,10 +120,13 @@ func (u Uint64) RightShift64(n uint, carryIn uint64) (value, carry uint64) { return u.w0, 0 case n < 64: - return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (n - 64) + return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (64 - n) case n == 64: return carryIn, u.w0 + + case n < 128: + return carryIn, u.w0 >> (n - 64) } log.Warnf("Uint64 overflow at RightShift64(%v, %v)", u, n) diff --git a/pkg/obikmer/encodefourmer.go b/pkg/obikmer/encodefourmer.go index 4196cf3..c3be879 100644 --- a/pkg/obikmer/encodefourmer.go +++ b/pkg/obikmer/encodefourmer.go @@ -97,8 +97,19 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int } // FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences. -// The returned values are two integer values. The shift between both the sequences and the count of -// matching 4mer when this shift is applied between both the sequences. +// +// Parameters: +// - index: A precomputed index of 4mer positions in a reference sequence. +// - shifts: A map to store the shift and count of matching 4mers. +// - lindex: The length of the indexed reference sequence. +// - seq: The sequence to be compared with the reference sequence. +// - relscore: A boolean indicating whether to calculate the relative score. +// - buffer: A byte buffer for encoding the sequence. +// +// Return type: +// - int: The shift between the two sequences with the maximum score. +// - int: The count of matching 4mers at the maximum score. +// - float64: The maximum score. func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) { iternal_buffer := Encode4mer(seq, buffer) @@ -126,12 +137,15 @@ func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obise score := float64(count) if relscore { over := -shift - if shift > 0 { + switch { + case shift > 0: over += lindex - } else { + case shift < 0: over = seq.Len() - over + default: + over = min(lindex, seq.Len()) } - score = score / float64(over) + score = score / float64(over-3) } if score > maxscore { maxshift = shift diff --git a/pkg/obikmer/kmermap.go b/pkg/obikmer/kmermap.go index a2ed037..f154683 100644 --- a/pkg/obikmer/kmermap.go +++ b/pkg/obikmer/kmermap.go @@ -2,6 +2,8 @@ package obikmer import ( "os" + "sort" + "unsafe" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -55,7 +57,12 @@ func (k *KmerMap[T]) KmerAsString(kmer T) string { func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T { + if sequence.Len() < int(k.Kmersize) { + return nil + } + makeSparseAt := func(kmer T) T { + if k.SparseAt == -1 { return kmer } @@ -65,10 +72,8 @@ func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T normalizedKmer := func(fw, rv T) T { - if k.SparseAt >= 0 { - fw = makeSparseAt(fw) - rv = makeSparseAt(rv) - } + fw = makeSparseAt(fw) + rv = makeSparseAt(rv) if fw.LessThan(rv) { return fw @@ -118,35 +123,62 @@ func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T rep = append(rep, kmer) size-- } + } return rep } -func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence) { +func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence, maxocc ...int) { + maxoccurs := -1 + if len(maxocc) > 0 { + maxoccurs = maxocc[0] + } kmers := k.NormalizedKmerSlice(sequence, nil) for _, kmer := range kmers { - k.index[kmer] = append(k.index[kmer], sequence) + seqs := k.index[kmer] + if maxoccurs == -1 || len(seqs) <= maxoccurs { + seqs = append(seqs, sequence) + k.index[kmer] = seqs + } } } func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch { kmers := k.NormalizedKmerSlice(sequence, nil) + seqs := make([]*obiseq.BioSequence, 0) rep := make(KmerMatch) for _, kmer := range kmers { - if _, ok := k.index[kmer]; ok { - for _, seq := range k.index[kmer] { - if seq != sequence { - if _, ok := rep[seq]; !ok { - rep[seq] = 0 - } - rep[seq]++ - } - } + if candidates, ok := k.index[kmer]; ok { + seqs = append(seqs, candidates...) } } + sort.Slice(seqs, + func(i, j int) bool { + return uintptr(unsafe.Pointer(seqs[i])) < uintptr(unsafe.Pointer(seqs[j])) + }) + + prevseq := (*obiseq.BioSequence)(nil) + n := 0 + for _, seq := range seqs { + + if seq != prevseq { + if prevseq != nil && prevseq != sequence { + rep[prevseq] = n + } + n = 1 + prevseq = seq + } + + n++ + } + + if prevseq != nil { + rep[prevseq] = n + } + return rep } @@ -187,7 +219,8 @@ func (k *KmerMatch) Max() *obiseq.BioSequence { func NewKmerMap[T obifp.FPUint[T]]( sequences obiseq.BioSequenceSlice, kmersize uint, - sparse bool) *KmerMap[T] { + sparse bool, + maxoccurs int) *KmerMap[T] { idx := make(map[T][]*obiseq.BioSequence) sparseAt := -1 @@ -246,11 +279,19 @@ func NewKmerMap[T obifp.FPUint[T]]( bar := progressbar.NewOptions(n, pbopt...) for i, sequence := range sequences { - kmap.Push(sequence) + kmap.Push(sequence, maxoccurs) if i%100 == 0 { bar.Add(100) } } + if maxoccurs >= 0 { + for k, s := range kmap.index { + if len(s) >= maxoccurs { + delete(kmap.index, k) + } + } + } + return kmap } diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index 773dcf3..e8b45e8 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -14,8 +14,8 @@ import ( ) var _Debug = false -var _WorkerPerCore = 2.0 -var _ReadWorkerPerCore = 0.5 +var _WorkerPerCore = 1.0 +var _ReadWorkerPerCore = 0.25 var _WriteWorkerPerCore = 0.25 var _StrictReadWorker = 0 var _StrictWriteWorker = 0 diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 2f26c42..dc37a88 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "bdb96dd" +var _Commit = "31bfc88" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obitools/obikmersim/obikmersim.go b/pkg/obitools/obikmersim/obikmersim.go index 06c02e7..b0b4a7c 100644 --- a/pkg/obitools/obikmersim/obikmersim.go +++ b/pkg/obitools/obikmersim/obikmersim.go @@ -27,6 +27,7 @@ func MakeCountMatchWorker[T obifp.FPUint[T]](k *obikmer.KmerMap[T], minKmerCount sequence.SetAttribute("obikmer_match_count", n) sequence.SetAttribute("obikmer_kmer_size", k.Kmersize) sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0) + return obiseq.BioSequenceSlice{sequence}, nil } } @@ -46,17 +47,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]]( slice := obiseq.NewBioSequenceSlice(matches.Len()) *slice = (*slice)[:0] - for _, seq := range matches.Sequences() { + candidates := matches.Sequences() + n := candidates.Len() + for _, seq := range candidates { idmatched_id := seq.Id() - score, path, fastcount, over, fastscore, reverse := obialign.ReadAlign( + score, path, fastcount, over, fastscore, directAlignment := obialign.ReadAlign( sequence, seq, gap, scale, delta, fastScoreRel, arena, &shifts, ) - if reverse { + if !directAlignment { idmatched_id = idmatched_id + "-rev" seq = seq.ReverseComplement(false) } @@ -75,17 +78,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]]( identity = 0 } - rep := sequence.Copy() + rep := cons + rep.SetAttribute("obikmer_match_count", n) rep.SetAttribute("obikmer_match_id", idmatched_id) rep.SetAttribute("obikmer_fast_count", fastcount) rep.SetAttribute("obikmer_fast_overlap", over) rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000) + rep.SetAttribute("seq_length", cons.Len()) - if reverse { - rep.SetAttribute("obikmer_orientation", "reverse") - } else { + if directAlignment { rep.SetAttribute("obikmer_orientation", "forward") + } else { + rep.SetAttribute("obikmer_orientation", "reverse") } if aliLength >= int(k.KmerSize()) && identity >= minIdentity { @@ -137,16 +142,21 @@ func CLILookForSharedKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence { iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize()) } - kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode()) + if CLISelf() { + iterator = iterator.Speed("Counting similar reads", references.Len()) + } else { + iterator = iterator.Speed("Counting similar reads") + } + + kmerMatch := obikmer.NewKmerMap[obifp.Uint128]( + references, + uint(CLIKmerSize()), + CLISparseMode(), + CLIMaxKmerOccurs()) + worker := MakeCountMatchWorker(kmerMatch, CLIMinSharedKmers()) newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) - if CLISelf() { - newIter = newIter.Speed("Counting similar reads", references.Len()) - } else { - newIter = newIter.Speed("Counting similar reads") - } - return newIter.FilterEmpty() } @@ -164,7 +174,11 @@ func CLIAlignSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence { } else { iterator = iterator.Speed("Aligning reads") } - kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode()) + kmerMatch := obikmer.NewKmerMap[obifp.Uint128]( + references, + uint(CLIKmerSize()), + CLISparseMode(), + CLIMaxKmerOccurs()) worker := MakeKmerAlignWorker(kmerMatch, CLIMinSharedKmers(), CLIGap(), CLIScale(), CLIDelta(), CLIFastRelativeScore(), 0.8, true) newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) diff --git a/pkg/obitools/obikmersim/options.go b/pkg/obitools/obikmersim/options.go index fe6bd1d..de36922 100644 --- a/pkg/obitools/obikmersim/options.go +++ b/pkg/obitools/obikmersim/options.go @@ -18,6 +18,7 @@ var _Delta = 5 var _PenalityScale = 1.0 var _GapPenality = 2.0 var _FastScoreAbs = false +var _KmerMaxOccur = -1 // PCROptionSet defines every options related to a simulated PCR. // @@ -46,6 +47,10 @@ func KmerSimCountOptionSet(options *getoptions.GetOpt) { options.Alias("m"), options.Description("Minimum number of shared kmers between two sequences.")) + options.IntVar(&_KmerMaxOccur, "max-kmers", _KmerMaxOccur, + options.Alias("M"), + options.Description("Maximum number of occurrence of a kmer.")) + options.BoolVar(&_Self, "self", _Self, options.Alias("s"), options.Description("Compare references with themselves.")) @@ -138,3 +143,7 @@ func CLIGap() float64 { func CLIFastRelativeScore() bool { return !_FastScoreAbs } + +func CLIMaxKmerOccurs() int { + return _KmerMaxOccur +}