On development genome skim tools

This commit is contained in:
Eric Coissac
2024-08-30 11:17:33 +02:00
parent cd330db672
commit 373464cb06
10 changed files with 144 additions and 50 deletions

View File

@ -44,10 +44,10 @@ func main() {
obiconvert.OpenSequenceDataErrorMessage(args, err) obiconvert.OpenSequenceDataErrorMessage(args, err)
selected := obikmersim.CLILookForSharedKmers(sequences) counted := obikmersim.CLILookForSharedKmers(sequences)
topull, err := obiconvert.CLIWriteBioSequences(selected, false) topull, err := obiconvert.CLIWriteBioSequences(counted, false)
if err == nil { if err != nil {
log.Panic(err) log.Panic(err)
} }

View File

@ -43,16 +43,25 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence,
directAlignment = false directAlignment = false
} }
if shift > 0 { // Compute the overlapping region length
switch {
case shift > 0:
over = seqA.Len() - shift over = seqA.Len() - shift
} else { case shift < 0:
over = seqB.Len() + shift over = seqB.Len() + shift
default:
over = min(seqA.Len(), seqB.Len())
} }
// log.Warnf("fw/fw: %v shift=%d fastCount=%d/over=%d fastScore=%f",
// directAlignment, shift, fastCount, over, fastScore)
// log.Warnf(("seqA: %s\nseqB: %s\n"), seqA.String(), seqB.String())
// At least one mismatch exists in the overlaping region // At least one mismatch exists in the overlaping region
if fastCount+3 < over { if fastCount+3 < over {
if shift > 0 { if shift > 0 || (shift == 0 && seqB.Len() >= seqA.Len()) {
startA = shift - delta startA = shift - delta
if startA < 0 { if startA < 0 {
startA = 0 startA = 0
@ -105,7 +114,7 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence,
// Both overlaping regions are identicals // Both overlaping regions are identicals
if shift > 0 { if shift > 0 || (shift == 0 && seqB.Len() >= seqA.Len()) {
startA = shift startA = shift
startB = 0 startB = 0
extra5 = -startA extra5 = -startA
@ -123,6 +132,7 @@ func ReadAlign(seqA, seqB *obiseq.BioSequence,
extra3 = partLen - seqA.Len() extra3 = partLen - seqA.Len()
qualSeqA = seqA.Qualities()[:partLen] qualSeqA = seqA.Qualities()[:partLen]
} }
score = 0 score = 0
for i, qualA := range qualSeqA { for i, qualA := range qualSeqA {
qualB := qualSeqB[i] qualB := qualSeqB[i]

View File

@ -145,7 +145,7 @@ func (u Uint128) Mul(v Uint128) Uint128 {
p2, p3 := bits.Mul64(u.w0, v.w1) p2, p3 := bits.Mul64(u.w0, v.w1)
hi, c0 := bits.Add64(hi, p1, 0) hi, c0 := bits.Add64(hi, p1, 0)
hi, c1 := bits.Add64(hi, p3, c0) hi, c1 := bits.Add64(hi, p3, c0)
if (u.w1 != 0 && v.w1 != 0) || p0 != 0 || p2 != 0 || c1 != 0 { if p0 != 0 || p2 != 0 || c1 != 0 {
log.Panicf("Uint128 overflow at Mul(%v, %v)", u, v) log.Panicf("Uint128 overflow at Mul(%v, %v)", u, v)
} }
return Uint128{w1: hi, w0: lo} return Uint128{w1: hi, w0: lo}

View File

@ -94,11 +94,14 @@ func (u Uint64) LeftShift64(n uint, carryIn uint64) (value, carry uint64) {
case n == 64: case n == 64:
return carryIn, u.w0 return carryIn, u.w0
case n < 128:
return carryIn, u.w0 << (n - 64)
} }
log.Warnf("Uint64 overflow at LeftShift64(%v, %v)", u, n) log.Warnf("Uint64 overflow at LeftShift64(%v, %v)", u, n)
return 0, 0 return 0, 0
} }
// RightShift64 performs a right shift operation on the Uint64 value by n bits, with carry-out to carry. // RightShift64 performs a right shift operation on the Uint64 value by n bits, with carry-out to carry.
@ -117,10 +120,13 @@ func (u Uint64) RightShift64(n uint, carryIn uint64) (value, carry uint64) {
return u.w0, 0 return u.w0, 0
case n < 64: case n < 64:
return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (n - 64) return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (64 - n)
case n == 64: case n == 64:
return carryIn, u.w0 return carryIn, u.w0
case n < 128:
return carryIn, u.w0 >> (n - 64)
} }
log.Warnf("Uint64 overflow at RightShift64(%v, %v)", u, n) log.Warnf("Uint64 overflow at RightShift64(%v, %v)", u, n)

View File

@ -97,8 +97,19 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
} }
// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences. // FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
// The returned values are two integer values. The shift between both the sequences and the count of //
// matching 4mer when this shift is applied between both the sequences. // Parameters:
// - index: A precomputed index of 4mer positions in a reference sequence.
// - shifts: A map to store the shift and count of matching 4mers.
// - lindex: The length of the indexed reference sequence.
// - seq: The sequence to be compared with the reference sequence.
// - relscore: A boolean indicating whether to calculate the relative score.
// - buffer: A byte buffer for encoding the sequence.
//
// Return type:
// - int: The shift between the two sequences with the maximum score.
// - int: The count of matching 4mers at the maximum score.
// - float64: The maximum score.
func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) { func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) {
iternal_buffer := Encode4mer(seq, buffer) iternal_buffer := Encode4mer(seq, buffer)
@ -126,12 +137,15 @@ func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obise
score := float64(count) score := float64(count)
if relscore { if relscore {
over := -shift over := -shift
if shift > 0 { switch {
case shift > 0:
over += lindex over += lindex
} else { case shift < 0:
over = seq.Len() - over over = seq.Len() - over
default:
over = min(lindex, seq.Len())
} }
score = score / float64(over) score = score / float64(over-3)
} }
if score > maxscore { if score > maxscore {
maxshift = shift maxshift = shift

View File

@ -2,6 +2,8 @@ package obikmer
import ( import (
"os" "os"
"sort"
"unsafe"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
@ -55,7 +57,12 @@ func (k *KmerMap[T]) KmerAsString(kmer T) string {
func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T { func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
if sequence.Len() < int(k.Kmersize) {
return nil
}
makeSparseAt := func(kmer T) T { makeSparseAt := func(kmer T) T {
if k.SparseAt == -1 { if k.SparseAt == -1 {
return kmer return kmer
} }
@ -65,10 +72,8 @@ func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T
normalizedKmer := func(fw, rv T) T { normalizedKmer := func(fw, rv T) T {
if k.SparseAt >= 0 { fw = makeSparseAt(fw)
fw = makeSparseAt(fw) rv = makeSparseAt(rv)
rv = makeSparseAt(rv)
}
if fw.LessThan(rv) { if fw.LessThan(rv) {
return fw return fw
@ -118,35 +123,62 @@ func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T
rep = append(rep, kmer) rep = append(rep, kmer)
size-- size--
} }
} }
return rep return rep
} }
func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence) { func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence, maxocc ...int) {
maxoccurs := -1
if len(maxocc) > 0 {
maxoccurs = maxocc[0]
}
kmers := k.NormalizedKmerSlice(sequence, nil) kmers := k.NormalizedKmerSlice(sequence, nil)
for _, kmer := range kmers { for _, kmer := range kmers {
k.index[kmer] = append(k.index[kmer], sequence) seqs := k.index[kmer]
if maxoccurs == -1 || len(seqs) <= maxoccurs {
seqs = append(seqs, sequence)
k.index[kmer] = seqs
}
} }
} }
func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch { func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
kmers := k.NormalizedKmerSlice(sequence, nil) kmers := k.NormalizedKmerSlice(sequence, nil)
seqs := make([]*obiseq.BioSequence, 0)
rep := make(KmerMatch) rep := make(KmerMatch)
for _, kmer := range kmers { for _, kmer := range kmers {
if _, ok := k.index[kmer]; ok { if candidates, ok := k.index[kmer]; ok {
for _, seq := range k.index[kmer] { seqs = append(seqs, candidates...)
if seq != sequence {
if _, ok := rep[seq]; !ok {
rep[seq] = 0
}
rep[seq]++
}
}
} }
} }
sort.Slice(seqs,
func(i, j int) bool {
return uintptr(unsafe.Pointer(seqs[i])) < uintptr(unsafe.Pointer(seqs[j]))
})
prevseq := (*obiseq.BioSequence)(nil)
n := 0
for _, seq := range seqs {
if seq != prevseq {
if prevseq != nil && prevseq != sequence {
rep[prevseq] = n
}
n = 1
prevseq = seq
}
n++
}
if prevseq != nil {
rep[prevseq] = n
}
return rep return rep
} }
@ -187,7 +219,8 @@ func (k *KmerMatch) Max() *obiseq.BioSequence {
func NewKmerMap[T obifp.FPUint[T]]( func NewKmerMap[T obifp.FPUint[T]](
sequences obiseq.BioSequenceSlice, sequences obiseq.BioSequenceSlice,
kmersize uint, kmersize uint,
sparse bool) *KmerMap[T] { sparse bool,
maxoccurs int) *KmerMap[T] {
idx := make(map[T][]*obiseq.BioSequence) idx := make(map[T][]*obiseq.BioSequence)
sparseAt := -1 sparseAt := -1
@ -246,11 +279,19 @@ func NewKmerMap[T obifp.FPUint[T]](
bar := progressbar.NewOptions(n, pbopt...) bar := progressbar.NewOptions(n, pbopt...)
for i, sequence := range sequences { for i, sequence := range sequences {
kmap.Push(sequence) kmap.Push(sequence, maxoccurs)
if i%100 == 0 { if i%100 == 0 {
bar.Add(100) bar.Add(100)
} }
} }
if maxoccurs >= 0 {
for k, s := range kmap.index {
if len(s) >= maxoccurs {
delete(kmap.index, k)
}
}
}
return kmap return kmap
} }

View File

@ -14,8 +14,8 @@ import (
) )
var _Debug = false var _Debug = false
var _WorkerPerCore = 2.0 var _WorkerPerCore = 1.0
var _ReadWorkerPerCore = 0.5 var _ReadWorkerPerCore = 0.25
var _WriteWorkerPerCore = 0.25 var _WriteWorkerPerCore = 0.25
var _StrictReadWorker = 0 var _StrictReadWorker = 0
var _StrictWriteWorker = 0 var _StrictWriteWorker = 0

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This induces that the version // TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// commited // commited
var _Commit = "bdb96dd" var _Commit = "31bfc88"
var _Version = "Release 4.2.0" var _Version = "Release 4.2.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@ -27,6 +27,7 @@ func MakeCountMatchWorker[T obifp.FPUint[T]](k *obikmer.KmerMap[T], minKmerCount
sequence.SetAttribute("obikmer_match_count", n) sequence.SetAttribute("obikmer_match_count", n)
sequence.SetAttribute("obikmer_kmer_size", k.Kmersize) sequence.SetAttribute("obikmer_kmer_size", k.Kmersize)
sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0) sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0)
return obiseq.BioSequenceSlice{sequence}, nil return obiseq.BioSequenceSlice{sequence}, nil
} }
} }
@ -46,17 +47,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]](
slice := obiseq.NewBioSequenceSlice(matches.Len()) slice := obiseq.NewBioSequenceSlice(matches.Len())
*slice = (*slice)[:0] *slice = (*slice)[:0]
for _, seq := range matches.Sequences() { candidates := matches.Sequences()
n := candidates.Len()
for _, seq := range candidates {
idmatched_id := seq.Id() idmatched_id := seq.Id()
score, path, fastcount, over, fastscore, reverse := obialign.ReadAlign( score, path, fastcount, over, fastscore, directAlignment := obialign.ReadAlign(
sequence, seq, sequence, seq,
gap, scale, delta, gap, scale, delta,
fastScoreRel, fastScoreRel,
arena, &shifts, arena, &shifts,
) )
if reverse { if !directAlignment {
idmatched_id = idmatched_id + "-rev" idmatched_id = idmatched_id + "-rev"
seq = seq.ReverseComplement(false) seq = seq.ReverseComplement(false)
} }
@ -75,17 +78,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]](
identity = 0 identity = 0
} }
rep := sequence.Copy() rep := cons
rep.SetAttribute("obikmer_match_count", n)
rep.SetAttribute("obikmer_match_id", idmatched_id) rep.SetAttribute("obikmer_match_id", idmatched_id)
rep.SetAttribute("obikmer_fast_count", fastcount) rep.SetAttribute("obikmer_fast_count", fastcount)
rep.SetAttribute("obikmer_fast_overlap", over) rep.SetAttribute("obikmer_fast_overlap", over)
rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000) rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000)
rep.SetAttribute("seq_length", cons.Len())
if reverse { if directAlignment {
rep.SetAttribute("obikmer_orientation", "reverse")
} else {
rep.SetAttribute("obikmer_orientation", "forward") rep.SetAttribute("obikmer_orientation", "forward")
} else {
rep.SetAttribute("obikmer_orientation", "reverse")
} }
if aliLength >= int(k.KmerSize()) && identity >= minIdentity { if aliLength >= int(k.KmerSize()) && identity >= minIdentity {
@ -137,16 +142,21 @@ func CLILookForSharedKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize()) iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize())
} }
kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode()) if CLISelf() {
iterator = iterator.Speed("Counting similar reads", references.Len())
} else {
iterator = iterator.Speed("Counting similar reads")
}
kmerMatch := obikmer.NewKmerMap[obifp.Uint128](
references,
uint(CLIKmerSize()),
CLISparseMode(),
CLIMaxKmerOccurs())
worker := MakeCountMatchWorker(kmerMatch, CLIMinSharedKmers()) worker := MakeCountMatchWorker(kmerMatch, CLIMinSharedKmers())
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
if CLISelf() {
newIter = newIter.Speed("Counting similar reads", references.Len())
} else {
newIter = newIter.Speed("Counting similar reads")
}
return newIter.FilterEmpty() return newIter.FilterEmpty()
} }
@ -164,7 +174,11 @@ func CLIAlignSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
} else { } else {
iterator = iterator.Speed("Aligning reads") iterator = iterator.Speed("Aligning reads")
} }
kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode()) kmerMatch := obikmer.NewKmerMap[obifp.Uint128](
references,
uint(CLIKmerSize()),
CLISparseMode(),
CLIMaxKmerOccurs())
worker := MakeKmerAlignWorker(kmerMatch, CLIMinSharedKmers(), CLIGap(), CLIScale(), CLIDelta(), CLIFastRelativeScore(), 0.8, true) worker := MakeKmerAlignWorker(kmerMatch, CLIMinSharedKmers(), CLIGap(), CLIScale(), CLIDelta(), CLIFastRelativeScore(), 0.8, true)
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())

View File

@ -18,6 +18,7 @@ var _Delta = 5
var _PenalityScale = 1.0 var _PenalityScale = 1.0
var _GapPenality = 2.0 var _GapPenality = 2.0
var _FastScoreAbs = false var _FastScoreAbs = false
var _KmerMaxOccur = -1
// PCROptionSet defines every options related to a simulated PCR. // PCROptionSet defines every options related to a simulated PCR.
// //
@ -46,6 +47,10 @@ func KmerSimCountOptionSet(options *getoptions.GetOpt) {
options.Alias("m"), options.Alias("m"),
options.Description("Minimum number of shared kmers between two sequences.")) options.Description("Minimum number of shared kmers between two sequences."))
options.IntVar(&_KmerMaxOccur, "max-kmers", _KmerMaxOccur,
options.Alias("M"),
options.Description("Maximum number of occurrence of a kmer."))
options.BoolVar(&_Self, "self", _Self, options.BoolVar(&_Self, "self", _Self,
options.Alias("s"), options.Alias("s"),
options.Description("Compare references with themselves.")) options.Description("Compare references with themselves."))
@ -138,3 +143,7 @@ func CLIGap() float64 {
func CLIFastRelativeScore() bool { func CLIFastRelativeScore() bool {
return !_FastScoreAbs return !_FastScoreAbs
} }
func CLIMaxKmerOccurs() int {
return _KmerMaxOccur
}