correction of several small bugs

This commit is contained in:
Eric Coissac
2024-09-03 06:08:07 -03:00
parent 373464cb06
commit 65ae82622e
22 changed files with 770 additions and 79 deletions

View File

@@ -25,8 +25,19 @@ import (
func BuildConsensus(seqs obiseq.BioSequenceSlice,
consensus_id string,
kmer_size int,
filter_out float64,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
if seqs.Len() == 0 {
return nil, fmt.Errorf("no sequence provided")
}
if seqs.Len() == 1 {
seq := seqs[0].Copy()
seq.SetAttribute("obiconsensus_consensus", false)
return seq, nil
}
if save_graph {
if dirname == "" {
dirname = "."
@@ -104,7 +115,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
log.Debugf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
seq, err := graph.LongestConsensus(consensus_id)
seq, err := graph.LongestConsensus(consensus_id, filter_out)
sumCount := 0
@@ -112,7 +123,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
for _, s := range seqs {
sumCount += s.Count()
}
seq.SetAttribute("obiconsensus_consensus", true)
seq.SetAttribute("obiconsensus_weight", sumCount)
seq.SetAttribute("obiconsensus_seq_length", seq.Len())
seq.SetAttribute("obiconsensus_kmer_size", kmer_size)
@@ -136,6 +147,10 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func
stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
if stats == nil {
log.Panicf("Sample %s not found in sequence %d", sample, i)
}
if value, ok := stats[sample]; ok {
return float64(value)
}
@@ -292,16 +307,16 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
pack[degree] = v
clean, err = BuildConsensus(pack,
fmt.Sprintf("%s_consensus", v.Id()),
kmer_size,
kmer_size, CLILowCoverage(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory())
if err != nil {
log.Warning(err)
clean = (*graph.Vertices)[i]
clean = (*graph.Vertices)[i].Copy()
clean.SetAttribute("obiconsensus_consensus", false)
} else {
clean.SetAttribute("obiconsensus_consensus", true)
}
pack.Recycle(false)
} else {
@@ -318,8 +333,9 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
annotations := v.Annotations()
staton := obiseq.StatsOnSlotName(sample_key)
for k, v := range annotations {
if !clean.HasAttribute(k) {
if !clean.HasAttribute(k) && k != staton {
clean.SetAttribute(k, v)
}
}
@@ -334,6 +350,83 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
return denoised
}
// MinionClusterDenoise builds one consensus sequence per cluster head found
// in graph and returns them as a slice.
//
// Every vertex is scored with its own sample weight plus the sample weights
// of all its neighbors. A vertex is a cluster head when none of its neighbors
// has a strictly higher score; on a tie, both vertices stay heads. For each
// head, BuildConsensus is run on the head together with its neighbors. When
// BuildConsensus fails, the error is logged as a warning and a copy of the
// head itself is used instead, flagged with obiconsensus_consensus=false.
//
// Each result gets the attribute sample_key set to graph.Name, its count set
// to the head's own weight, and every annotation of the head that the result
// does not already carry, except the stats slot associated with sample_key.
//
// Parameters:
//   - graph: the graph whose vertices are the sequences of one sample.
//   - sample_key: the attribute holding the sample name; also used to look up
//     the per-sample weights through SampleWeight.
//   - kmer_size: forwarded unchanged to BuildConsensus.
func MinionClusterDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
	sample_key string, kmer_size int) obiseq.BioSequenceSlice {
	denoised := obiseq.MakeBioSequenceSlice()
	seqs := (*obiseq.BioSequenceSlice)(graph.Vertices)
	weight := SampleWeight(seqs, graph.Name, sample_key)
	seqWeights := make([]float64, len(*seqs))

	// Score each vertex: its own weight plus the weights of its neighbors.
	log.Info("")
	log.Infof("Sample %s: Computing weights", graph.Name)
	for i := range *seqs {
		w := weight(i)
		for _, j := range graph.Neighbors(i) {
			w += weight(j)
		}
		seqWeights[i] = w
	}
	log.Infof("Sample %s: Done computing weights", graph.Name)

	log.Infof("Sample %s: Clustering", graph.Name)
	// Look for vertices that have no neighbor with a strictly higher score.
	for i := range *seqs {
		v := (*seqs)[i]
		neighbors := graph.Neighbors(i)

		head := true
		for _, j := range neighbors {
			if seqWeights[i] < seqWeights[j] {
				// A single heavier neighbor is enough to rule this vertex
				// out, so there is no need to look at the remaining ones.
				head = false
				break
			}
		}
		if !head {
			continue
		}

		// The consensus is built from all the neighbors plus the head itself,
		// the head being stored in the last slot.
		pack := obiseq.MakeBioSequenceSlice(len(neighbors) + 1)
		for k, j := range neighbors {
			pack[k] = (*seqs)[j]
		}
		pack[len(neighbors)] = v

		clean, err := BuildConsensus(pack,
			fmt.Sprintf("%s_consensus", v.Id()),
			kmer_size, CLILowCoverage(),
			CLISaveGraphToFiles(), CLIGraphFilesDirectory())

		if err != nil {
			// Fall back on a copy of the head so the shared vertex is not
			// modified by the annotations set below.
			log.Warning(err)
			clean = (*graph.Vertices)[i].Copy()
			clean.SetAttribute("obiconsensus_consensus", false)
		}

		pack.Recycle(false)

		clean.SetAttribute(sample_key, graph.Name)

		annotations := v.Annotations()
		// The count is the head's own weight, not the summed score used to
		// elect it.
		clean.SetCount(int(weight(i)))

		// Copy the head's annotations the result does not define itself,
		// leaving out the stats slot tied to sample_key.
		staton := obiseq.StatsOnSlotName(sample_key)
		for key, value := range annotations {
			if !clean.HasAttribute(key) && key != staton {
				clean.SetAttribute(key, value)
			}
		}

		denoised = append(denoised, clean)
	}
	log.Infof("Sample %s: Done clustering", graph.Name)

	return denoised
}
func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
dirname := CLIGraphFilesDirectory()
newIter := obiiter.MakeIBioSequence()
@@ -395,9 +488,17 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
false, 1, 0, 3)
}
denoised := MinionDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
var denoised obiseq.BioSequenceSlice
if CLICluterDenoise() {
denoised = MinionClusterDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
} else {
denoised = MinionDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
}
newIter.Push(obiiter.MakeBioSequenceBatch(source, sample_order, denoised))
@@ -411,9 +512,14 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
newIter.WaitAndClose()
}()
obiuniq.AddStatsOn(CLISampleAttribute())
// obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton())
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
res := newIter
if CLIUnique() {
obiuniq.AddStatsOn(CLISampleAttribute())
// obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton())
res = obiuniq.CLIUnique(newIter)
}
return res.Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
}

View File

@@ -8,8 +8,6 @@ import (
var _distStepMax = 1
var _sampleAttribute = "sample"
var _ratioMax = 1.0
var _clusterMode = false
var _onlyHead = false
@@ -20,6 +18,10 @@ var _NoSingleton = false
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"
var _lowCoverage = 0.0
var _unique = false
// ObiminionOptionSet sets the options for obiminion.
//
// options: The options for configuring obiminion.
@@ -50,6 +52,19 @@ func ObiminionOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
options.Description("If set, sequences occurring a single time in the data set are discarded."))
options.BoolVar(&_clusterMode, "cluster", _clusterMode,
options.Alias("C"),
options.Description("Switch obiconsensus into its clustering mode."),
)
options.BoolVar(&_unique, "unique", _unique,
options.Alias("U"),
options.Description("If set, sequences are dereplicated on the output (obiuniq)."),
)
options.Float64Var(&_lowCoverage, "low-coverage", _lowCoverage,
options.Description("If the coverage of a sample is lower than this value, it will be discarded."),
)
}
// OptionSet sets up the options for the obiminion package.
@@ -129,4 +144,16 @@ func CLIKmerSize() int {
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
func CLINoSingleton() bool {
	// _NoSingleton is the variable bound to the "no-singleton" command line
	// flag in ObiminionOptionSet.
	return _NoSingleton
}
}
// CLICluterDenoise reports whether the clustering mode was requested.
//
// It returns the value of _clusterMode, which ObiminionOptionSet binds to the
// "cluster" command line flag (alias "C").
//
// NOTE(review): the name is misspelled ("Cluter" for "Cluster"); it is left
// untouched here because existing callers refer to it under this name.
func CLICluterDenoise() bool {
return _clusterMode
}
// CLIUnique reports whether the output sequences must be dereplicated.
//
// It returns the value of _unique, which ObiminionOptionSet binds to the
// "unique" command line flag (alias "U").
func CLIUnique() bool {
return _unique
}
// CLILowCoverage returns the low-coverage threshold given on the command line.
//
// It returns the value of _lowCoverage, which ObiminionOptionSet binds to the
// "low-coverage" command line flag.
func CLILowCoverage() float64 {
return _lowCoverage
}