mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Cleaning of obiminion
Former-commit-id: 75148afd70e5006cc6855bcddc86506b099761a1
This commit is contained in:
@ -9,6 +9,7 @@ import (
|
|||||||
"slices"
|
"slices"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
"github.com/daichi-m/go18ds/sets/linkedhashset"
|
"github.com/daichi-m/go18ds/sets/linkedhashset"
|
||||||
"github.com/daichi-m/go18ds/stacks/arraystack"
|
"github.com/daichi-m/go18ds/stacks/arraystack"
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
@ -294,7 +295,7 @@ func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 {
|
|||||||
return path
|
return path
|
||||||
}
|
}
|
||||||
|
|
||||||
func (g *DeBruijnGraph) LongestConsensus(id string, max_length int) (*obiseq.BioSequence, error) {
|
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
|
||||||
//path := g.LongestPath(max_length)
|
//path := g.LongestPath(max_length)
|
||||||
path := g.HaviestPath()
|
path := g.HaviestPath()
|
||||||
s := g.DecodePath(path)
|
s := g.DecodePath(path)
|
||||||
@ -464,11 +465,12 @@ func (graph *DeBruijnGraph) Gml() string {
|
|||||||
|
|
||||||
for idx := range graph.graph {
|
for idx := range graph.graph {
|
||||||
srcid := nodeidx[idx]
|
srcid := nodeidx[idx]
|
||||||
|
weight := graph.Weight(idx)
|
||||||
n := graph.Nexts(idx)
|
n := graph.Nexts(idx)
|
||||||
for _, dst := range n {
|
for _, dst := range n {
|
||||||
dstid := nodeidx[dst]
|
dstid := nodeidx[dst]
|
||||||
|
weight := obiutils.Min(graph.Weight(dst), weight)
|
||||||
label := decode[dst&3]
|
label := decode[dst&3]
|
||||||
weight := graph.Weight(dst)
|
|
||||||
buffer.WriteString(
|
buffer.WriteString(
|
||||||
fmt.Sprintf(`edge [ source "%d"
|
fmt.Sprintf(`edge [ source "%d"
|
||||||
target "%d"
|
target "%d"
|
||||||
|
@ -17,9 +17,7 @@ import (
|
|||||||
|
|
||||||
func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||||
consensus_id string,
|
consensus_id string,
|
||||||
kmer_size int, quorum float64,
|
kmer_size int,
|
||||||
min_depth float64,
|
|
||||||
max_length int,
|
|
||||||
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
|
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
|
||||||
|
|
||||||
if save_graph {
|
if save_graph {
|
||||||
@ -37,7 +35,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id)))
|
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s_consensus.fasta", consensus_id)))
|
||||||
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
defer fasta.Close()
|
defer fasta.Close()
|
||||||
@ -58,16 +56,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
|||||||
longest[i] = slices.Max(sa.CommonSuffix())
|
longest[i] = slices.Max(sa.CommonSuffix())
|
||||||
}
|
}
|
||||||
|
|
||||||
// o := obiutils.Order(sort.IntSlice(longest))
|
|
||||||
// i := int(float64(len(seqs)) * quorum)
|
|
||||||
|
|
||||||
// if i >= len(o) {
|
|
||||||
// i = len(o) - 1
|
|
||||||
// }
|
|
||||||
|
|
||||||
kmer_size = slices.Max(longest) + 1
|
kmer_size = slices.Max(longest) + 1
|
||||||
|
|
||||||
// kmer_size = longest[o[i]] + 1
|
|
||||||
log.Printf("estimated kmer size : %d", kmer_size)
|
log.Printf("estimated kmer size : %d", kmer_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,7 +79,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
|||||||
if save_graph {
|
if save_graph {
|
||||||
|
|
||||||
file, err := os.Create(path.Join(dirname,
|
file, err := os.Create(path.Join(dirname,
|
||||||
fmt.Sprintf("%s_raw_consensus.gml", consensus_id)))
|
fmt.Sprintf("%s_consensus.gml", consensus_id)))
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println(err)
|
fmt.Println(err)
|
||||||
@ -103,65 +92,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
|||||||
log.Printf("Graph size : %d\n", graph.Len())
|
log.Printf("Graph size : %d\n", graph.Len())
|
||||||
total_kmer := graph.Len()
|
total_kmer := graph.Len()
|
||||||
|
|
||||||
// threshold := 0
|
seq, err := graph.LongestConsensus(consensus_id)
|
||||||
|
|
||||||
// switch {
|
|
||||||
// case min_depth < 0:
|
|
||||||
// spectrum := graph.WeightSpectrum()
|
|
||||||
// cum := make(map[int]int)
|
|
||||||
|
|
||||||
// spectrum[1] = 0
|
|
||||||
// for i := 2; i < len(spectrum); i++ {
|
|
||||||
// spectrum[i] += spectrum[i-1]
|
|
||||||
// cum[spectrum[i]]++
|
|
||||||
// }
|
|
||||||
|
|
||||||
// max := 0
|
|
||||||
// kmax := 0
|
|
||||||
// for k, obs := range cum {
|
|
||||||
// if obs > max {
|
|
||||||
// max = obs
|
|
||||||
// kmax = k
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// for i, total := range spectrum {
|
|
||||||
// if total == kmax {
|
|
||||||
// threshold = i
|
|
||||||
// break
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// threshold /= 2
|
|
||||||
|
|
||||||
// if threshold < 1 {
|
|
||||||
// threshold = 1
|
|
||||||
// }
|
|
||||||
|
|
||||||
// log.Info("Estimated kmer_min_occur = ", threshold)
|
|
||||||
// case min_depth >= 1:
|
|
||||||
// threshold = int(min_depth)
|
|
||||||
// default:
|
|
||||||
// threshold = int(float64(len(seqs)) * min_depth)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// graph.FilterMinWeight(threshold)
|
|
||||||
|
|
||||||
// log.Printf("Graph size : %d\n", graph.Len())
|
|
||||||
|
|
||||||
// if save_graph {
|
|
||||||
|
|
||||||
// file, err := os.Create(path.Join(dirname,
|
|
||||||
// fmt.Sprintf("%s_consensus.gml", consensus_id)))
|
|
||||||
|
|
||||||
// if err != nil {
|
|
||||||
// fmt.Println(err)
|
|
||||||
// } else {
|
|
||||||
// file.WriteString(graph.Gml())
|
|
||||||
// file.Close()
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
seq, err := graph.LongestConsensus(consensus_id, max_length)
|
|
||||||
|
|
||||||
sumCount := 0
|
sumCount := 0
|
||||||
|
|
||||||
@ -173,7 +104,6 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
|||||||
seq.SetCount(sumCount)
|
seq.SetCount(sumCount)
|
||||||
seq.SetAttribute("seq_length", seq.Len())
|
seq.SetAttribute("seq_length", seq.Len())
|
||||||
seq.SetAttribute("kmer_size", kmer_size)
|
seq.SetAttribute("kmer_size", kmer_size)
|
||||||
//seq.SetAttribute("kmer_min_occur", threshold)
|
|
||||||
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
|
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
|
||||||
seq.SetAttribute("filtered_graph_size", graph.Len())
|
seq.SetAttribute("filtered_graph_size", graph.Len())
|
||||||
seq.SetAttribute("full_graph_size", total_kmer)
|
seq.SetAttribute("full_graph_size", total_kmer)
|
||||||
@ -181,52 +111,6 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
|||||||
return seq, err
|
return seq, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice,
|
|
||||||
// kmer_size int, quorum float64,
|
|
||||||
// min_depth float64,
|
|
||||||
// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) {
|
|
||||||
|
|
||||||
// ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
|
||||||
// defer cancel()
|
|
||||||
|
|
||||||
// consensus := func() *obiseq.BioSequence {
|
|
||||||
// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,)
|
|
||||||
// if err != nil {
|
|
||||||
// cons = nil
|
|
||||||
// }
|
|
||||||
|
|
||||||
// return cons
|
|
||||||
// }
|
|
||||||
|
|
||||||
// computation := func() <-chan *obiseq.BioSequence {
|
|
||||||
// result := make(chan *obiseq.BioSequence)
|
|
||||||
|
|
||||||
// go func() {
|
|
||||||
// select {
|
|
||||||
// case <-ctx.Done():
|
|
||||||
// result <- nil
|
|
||||||
// default:
|
|
||||||
// result <- consensus()
|
|
||||||
|
|
||||||
// }
|
|
||||||
// }()
|
|
||||||
|
|
||||||
// return result
|
|
||||||
// }
|
|
||||||
|
|
||||||
// calcResult := computation()
|
|
||||||
|
|
||||||
// select {
|
|
||||||
// case result := <-calcResult:
|
|
||||||
// if result == nil {
|
|
||||||
// return nil, fmt.Errorf("cannot compute consensus")
|
|
||||||
// }
|
|
||||||
// return result, nil
|
|
||||||
// case <-ctx.Done():
|
|
||||||
// return nil, fmt.Errorf("compute consensus timeout, exiting")
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
size := 10
|
size := 10
|
||||||
@ -266,9 +150,7 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
}
|
}
|
||||||
consensus, err := BuildConsensus(sequences,
|
consensus, err := BuildConsensus(sequences,
|
||||||
id,
|
id,
|
||||||
CLIKmerSize(), CLIThreshold(),
|
CLIKmerSize(),
|
||||||
CLIKmerDepth(),
|
|
||||||
CLIMaxConsensusLength(),
|
|
||||||
CLISaveGraphToFiles(),
|
CLISaveGraphToFiles(),
|
||||||
CLIGraphFilesDirectory(),
|
CLIGraphFilesDirectory(),
|
||||||
)
|
)
|
||||||
|
@ -7,9 +7,6 @@ import (
|
|||||||
|
|
||||||
var _saveGraph = "__@@NOSAVE@@__"
|
var _saveGraph = "__@@NOSAVE@@__"
|
||||||
var _kmerSize = -1
|
var _kmerSize = -1
|
||||||
var _threshold = 0.99
|
|
||||||
var _mindepth = -1.0
|
|
||||||
var _consensus_max_length = -1
|
|
||||||
|
|
||||||
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
||||||
|
|
||||||
@ -25,26 +22,6 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
|||||||
"Default value = -1, which means that the kmer size is estimated from the data"),
|
"Default value = -1, which means that the kmer size is estimated from the data"),
|
||||||
)
|
)
|
||||||
|
|
||||||
options.Float64Var(&_threshold, "threshold", _threshold,
|
|
||||||
options.ArgName("RATIO"),
|
|
||||||
options.Description("A threshold between O and 1 used to determine the optimal "+
|
|
||||||
"kmer size"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.Float64Var(&_mindepth, "min-depth", _mindepth,
|
|
||||||
options.ArgName("DEPTH"),
|
|
||||||
options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
|
|
||||||
"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
|
|
||||||
"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
|
|
||||||
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
|
|
||||||
options.ArgName("LENGTH"),
|
|
||||||
options.Description("Maximum length of the consensus sequence. "+
|
|
||||||
"Default value = -1, which means that no limit is applied"),
|
|
||||||
)
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
@ -66,15 +43,3 @@ func CLIGraphFilesDirectory() string {
|
|||||||
func CLIKmerSize() int {
|
func CLIKmerSize() int {
|
||||||
return _kmerSize
|
return _kmerSize
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLIKmerDepth() float64 {
|
|
||||||
return _mindepth
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIThreshold() float64 {
|
|
||||||
return _threshold
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIMaxConsensusLength() int {
|
|
||||||
return _consensus_max_length
|
|
||||||
}
|
|
||||||
|
@ -157,7 +157,7 @@ func BuildDiffSeqGraph(name, name_key string,
|
|||||||
}
|
}
|
||||||
|
|
||||||
func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
||||||
sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice {
|
sample_key string, kmer_size int) obiseq.BioSequenceSlice {
|
||||||
denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))
|
denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))
|
||||||
|
|
||||||
for i, v := range *graph.Vertices {
|
for i, v := range *graph.Vertices {
|
||||||
@ -173,8 +173,6 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
|||||||
clean, err = obiconsensus.BuildConsensus(pack,
|
clean, err = obiconsensus.BuildConsensus(pack,
|
||||||
fmt.Sprintf("%s_consensus", v.Id()),
|
fmt.Sprintf("%s_consensus", v.Id()),
|
||||||
kmer_size,
|
kmer_size,
|
||||||
threshold,
|
|
||||||
depth, max_length,
|
|
||||||
CLISaveGraphToFiles(), CLIGraphFilesDirectory())
|
CLISaveGraphToFiles(), CLIGraphFilesDirectory())
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -209,11 +207,6 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
samples := SeqBySamples(db, CLISampleAttribute())
|
samples := SeqBySamples(db, CLISampleAttribute())
|
||||||
db.Recycle(false)
|
db.Recycle(false)
|
||||||
|
|
||||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
|
||||||
if CLIMaxConsensusLength() > 0 {
|
|
||||||
log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength())
|
|
||||||
}
|
|
||||||
|
|
||||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||||
|
|
||||||
if CLISaveGraphToFiles() {
|
if CLISaveGraphToFiles() {
|
||||||
@ -266,10 +259,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
|
|
||||||
denoised := MinionDenoise(graph,
|
denoised := MinionDenoise(graph,
|
||||||
CLISampleAttribute(),
|
CLISampleAttribute(),
|
||||||
CLIKmerSize(),
|
CLIKmerSize())
|
||||||
CLIMaxConsensusLength(),
|
|
||||||
CLIThreshold(),
|
|
||||||
CLIKmerDepth())
|
|
||||||
|
|
||||||
newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))
|
newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))
|
||||||
|
|
||||||
|
@ -9,16 +9,11 @@ var _distStepMax = 1
|
|||||||
var _sampleAttribute = "sample"
|
var _sampleAttribute = "sample"
|
||||||
|
|
||||||
var _ratioMax = 1.0
|
var _ratioMax = 1.0
|
||||||
var _minEvalRate = 1000
|
|
||||||
|
|
||||||
var _clusterMode = false
|
var _clusterMode = false
|
||||||
var _onlyHead = false
|
var _onlyHead = false
|
||||||
|
|
||||||
var _kmerSize = -1
|
var _kmerSize = -1
|
||||||
var _threshold = 1.0
|
|
||||||
var _mindepth = -1.0
|
|
||||||
|
|
||||||
var _consensus_max_length = 1000
|
|
||||||
|
|
||||||
var _NoSingleton = false
|
var _NoSingleton = false
|
||||||
|
|
||||||
@ -37,9 +32,6 @@ func ObiminionOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.Alias("d"),
|
options.Alias("d"),
|
||||||
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
||||||
|
|
||||||
options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
|
|
||||||
options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))
|
|
||||||
|
|
||||||
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
||||||
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
|
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
|
||||||
"The graph files follow the graphml format."),
|
"The graph files follow the graphml format."),
|
||||||
@ -55,30 +47,9 @@ func ObiminionOptionSet(options *getoptions.GetOpt) {
|
|||||||
"Default value = -1, which means that the kmer size is estimated from the data"),
|
"Default value = -1, which means that the kmer size is estimated from the data"),
|
||||||
)
|
)
|
||||||
|
|
||||||
options.Float64Var(&_threshold, "threshold", _threshold,
|
|
||||||
options.ArgName("RATIO"),
|
|
||||||
options.Description("A threshold between O and 1 used to determine the optimal "+
|
|
||||||
"kmer size"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.Float64Var(&_mindepth, "min-depth", _mindepth,
|
|
||||||
options.ArgName("DEPTH"),
|
|
||||||
options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
|
|
||||||
"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
|
|
||||||
"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
|
|
||||||
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
|
|
||||||
options.ArgName("LENGTH"),
|
|
||||||
options.Description("Maximum length of the consensus sequence. "+
|
|
||||||
"Default value = -1, which means that no limit is applied"),
|
|
||||||
)
|
|
||||||
|
|
||||||
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
|
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
|
||||||
options.Description("If set, sequences occurring a single time in the data set are discarded."))
|
options.Description("If set, sequences occurring a single time in the data set are discarded."))
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// OptionSet sets up the options for the obiminion package.
|
// OptionSet sets up the options for the obiminion package.
|
||||||
@ -111,12 +82,6 @@ func CLISampleAttribute() string {
|
|||||||
return _sampleAttribute
|
return _sampleAttribute
|
||||||
}
|
}
|
||||||
|
|
||||||
// > The function `CLIMinCountToEvalMutationRate()` returns the minimum number of reads that must be
|
|
||||||
// observed before the mutation rate can be evaluated
|
|
||||||
func CLIMinCountToEvalMutationRate() int {
|
|
||||||
return _minEvalRate
|
|
||||||
}
|
|
||||||
|
|
||||||
func ClusterMode() bool {
|
func ClusterMode() bool {
|
||||||
return _clusterMode
|
return _clusterMode
|
||||||
}
|
}
|
||||||
@ -158,18 +123,6 @@ func CLIKmerSize() int {
|
|||||||
return _kmerSize
|
return _kmerSize
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLIKmerDepth() float64 {
|
|
||||||
return _mindepth
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIThreshold() float64 {
|
|
||||||
return _threshold
|
|
||||||
}
|
|
||||||
|
|
||||||
func CLIMaxConsensusLength() int {
|
|
||||||
return _consensus_max_length
|
|
||||||
}
|
|
||||||
|
|
||||||
// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded.
|
// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded.
|
||||||
//
|
//
|
||||||
// No parameters.
|
// No parameters.
|
||||||
|
Reference in New Issue
Block a user