Work in progress on the genome skim tools

This commit is contained in:
Eric Coissac
2024-08-30 11:17:33 +02:00
parent cd330db672
commit 373464cb06
10 changed files with 144 additions and 50 deletions

View File

@@ -27,6 +27,7 @@ func MakeCountMatchWorker[T obifp.FPUint[T]](k *obikmer.KmerMap[T], minKmerCount
sequence.SetAttribute("obikmer_match_count", n)
sequence.SetAttribute("obikmer_kmer_size", k.Kmersize)
sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0)
return obiseq.BioSequenceSlice{sequence}, nil
}
}
@@ -46,17 +47,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]](
slice := obiseq.NewBioSequenceSlice(matches.Len())
*slice = (*slice)[:0]
for _, seq := range matches.Sequences() {
candidates := matches.Sequences()
n := candidates.Len()
for _, seq := range candidates {
idmatched_id := seq.Id()
score, path, fastcount, over, fastscore, reverse := obialign.ReadAlign(
score, path, fastcount, over, fastscore, directAlignment := obialign.ReadAlign(
sequence, seq,
gap, scale, delta,
fastScoreRel,
arena, &shifts,
)
if reverse {
if !directAlignment {
idmatched_id = idmatched_id + "-rev"
seq = seq.ReverseComplement(false)
}
@@ -75,17 +78,19 @@ func MakeKmerAlignWorker[T obifp.FPUint[T]](
identity = 0
}
rep := sequence.Copy()
rep := cons
rep.SetAttribute("obikmer_match_count", n)
rep.SetAttribute("obikmer_match_id", idmatched_id)
rep.SetAttribute("obikmer_fast_count", fastcount)
rep.SetAttribute("obikmer_fast_overlap", over)
rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000)
rep.SetAttribute("seq_length", cons.Len())
if reverse {
rep.SetAttribute("obikmer_orientation", "reverse")
} else {
if directAlignment {
rep.SetAttribute("obikmer_orientation", "forward")
} else {
rep.SetAttribute("obikmer_orientation", "reverse")
}
if aliLength >= int(k.KmerSize()) && identity >= minIdentity {
@@ -137,16 +142,21 @@ func CLILookForSharedKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize())
}
kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode())
if CLISelf() {
iterator = iterator.Speed("Counting similar reads", references.Len())
} else {
iterator = iterator.Speed("Counting similar reads")
}
kmerMatch := obikmer.NewKmerMap[obifp.Uint128](
references,
uint(CLIKmerSize()),
CLISparseMode(),
CLIMaxKmerOccurs())
worker := MakeCountMatchWorker(kmerMatch, CLIMinSharedKmers())
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
if CLISelf() {
newIter = newIter.Speed("Counting similar reads", references.Len())
} else {
newIter = newIter.Speed("Counting similar reads")
}
return newIter.FilterEmpty()
}
@@ -164,7 +174,11 @@ func CLIAlignSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
} else {
iterator = iterator.Speed("Aligning reads")
}
kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode())
kmerMatch := obikmer.NewKmerMap[obifp.Uint128](
references,
uint(CLIKmerSize()),
CLISparseMode(),
CLIMaxKmerOccurs())
worker := MakeKmerAlignWorker(kmerMatch, CLIMinSharedKmers(), CLIGap(), CLIScale(), CLIDelta(), CLIFastRelativeScore(), 0.8, true)
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())

View File

@@ -18,6 +18,7 @@ var _Delta = 5
// Package-level storage for command-line option values, with their defaults.
var (
	// _PenalityScale defaults to 1.0.
	// NOTE(review): presumably a scaling factor applied to alignment
	// penalties; confirm against the code that reads it.
	_PenalityScale = 1.0

	// _GapPenality defaults to 2.0.
	// NOTE(review): presumably the gap penalty exposed through CLIGap; confirm.
	_GapPenality = 2.0

	// _FastScoreAbs defaults to false. CLIFastRelativeScore returns its
	// negation.
	_FastScoreAbs = false

	// _KmerMaxOccur backs the "max-kmers" (-M) option and is returned by
	// CLIMaxKmerOccurs. It defaults to -1.
	// NOTE(review): -1 presumably means "no limit"; confirm in the consumer.
	_KmerMaxOccur = -1
)
// PCROptionSet defines every option related to a simulated PCR.
//
@@ -46,6 +47,10 @@ func KmerSimCountOptionSet(options *getoptions.GetOpt) {
options.Alias("m"),
options.Description("Minimum number of shared kmers between two sequences."))
options.IntVar(&_KmerMaxOccur, "max-kmers", _KmerMaxOccur,
options.Alias("M"),
options.Description("Maximum number of occurrence of a kmer."))
options.BoolVar(&_Self, "self", _Self,
options.Alias("s"),
options.Description("Compare references with themselves."))
@@ -138,3 +143,7 @@ func CLIGap() float64 {
// CLIFastRelativeScore returns true when the package-level _FastScoreAbs
// flag is false, and false when that flag is set.
func CLIFastRelativeScore() bool {
	relative := !_FastScoreAbs
	return relative
}
// CLIMaxKmerOccurs returns the current value of the package-level
// _KmerMaxOccur variable, which the "max-kmers" (-M) command-line option
// writes to.
func CLIMaxKmerOccurs() int {
	maxOccur := _KmerMaxOccur
	return maxOccur
}