diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 1df3b6a..89d9428 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -9,6 +9,7 @@ import ( "slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "github.com/daichi-m/go18ds/sets/linkedhashset" "github.com/daichi-m/go18ds/stacks/arraystack" log "github.com/sirupsen/logrus" @@ -294,7 +295,7 @@ func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 { return path } -func (g *DeBruijnGraph) LongestConsensus(id string, max_length int) (*obiseq.BioSequence, error) { +func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) { //path := g.LongestPath(max_length) path := g.HaviestPath() s := g.DecodePath(path) @@ -464,11 +465,12 @@ func (graph *DeBruijnGraph) Gml() string { for idx := range graph.graph { srcid := nodeidx[idx] + weight := graph.Weight(idx) n := graph.Nexts(idx) for _, dst := range n { dstid := nodeidx[dst] + weight := obiutils.Min(graph.Weight(dst), weight) label := decode[dst&3] - weight := graph.Weight(dst) buffer.WriteString( fmt.Sprintf(`edge [ source "%d" target "%d" diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index 3dcb628..af5f10b 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -17,9 +17,7 @@ import ( func BuildConsensus(seqs obiseq.BioSequenceSlice, consensus_id string, - kmer_size int, quorum float64, - min_depth float64, - max_length int, + kmer_size int, save_graph bool, dirname string) (*obiseq.BioSequence, error) { if save_graph { @@ -37,7 +35,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } } - fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id))) + fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s_consensus.fasta", consensus_id))) if err == nil { defer fasta.Close() @@ -58,16 +56,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, longest[i] = slices.Max(sa.CommonSuffix()) } - // o := obiutils.Order(sort.IntSlice(longest)) - // i := int(float64(len(seqs)) * quorum) - - // if i >= len(o) { - // i = len(o) - 1 - // } - kmer_size = slices.Max(longest) + 1 - - // kmer_size = longest[o[i]] + 1 log.Printf("estimated kmer size : %d", kmer_size) } @@ -90,7 +79,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, if save_graph { file, err := os.Create(path.Join(dirname, - fmt.Sprintf("%s_raw_consensus.gml", consensus_id))) + fmt.Sprintf("%s_consensus.gml", consensus_id))) if err != nil { fmt.Println(err) @@ -103,65 +92,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, log.Printf("Graph size : %d\n", graph.Len()) total_kmer := graph.Len() - // threshold := 0 - - // switch { - // case min_depth < 0: - // spectrum := graph.WeightSpectrum() - // cum := make(map[int]int) - - // spectrum[1] = 0 - // for i := 2; i < len(spectrum); i++ { - // spectrum[i] += spectrum[i-1] - // cum[spectrum[i]]++ - // } - - // max := 0 - // kmax := 0 - // for k, obs := range cum { - // if obs > max { - // max = obs - // kmax = k - // } - // } - - // for i, total := range spectrum { - // if total == kmax { - // threshold = i - // break - // } - // } - // threshold /= 2 - - // if threshold < 1 { - // threshold = 1 - // } - - // log.Info("Estimated kmer_min_occur = ", threshold) - // case min_depth >= 1: - // threshold = int(min_depth) - // default: - // threshold = int(float64(len(seqs)) * min_depth) - // } - - // graph.FilterMinWeight(threshold) - - // log.Printf("Graph size : %d\n", graph.Len()) - - // if save_graph { - - // file, err := os.Create(path.Join(dirname, - // fmt.Sprintf("%s_consensus.gml", consensus_id))) - - // if err != nil { - // fmt.Println(err) - // } else { - // file.WriteString(graph.Gml()) - // file.Close() - // } - // } - - seq, err := graph.LongestConsensus(consensus_id, max_length) + seq, err := graph.LongestConsensus(consensus_id) sumCount := 0 @@ -173,7 +104,6 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, seq.SetCount(sumCount) seq.SetAttribute("seq_length", seq.Len()) seq.SetAttribute("kmer_size", kmer_size) - //seq.SetAttribute("kmer_min_occur", threshold) seq.SetAttribute("kmer_max_occur", graph.MaxWeight()) seq.SetAttribute("filtered_graph_size", graph.Len()) seq.SetAttribute("full_graph_size", total_kmer) @@ -181,52 +111,6 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, return seq, err } -// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice, -// kmer_size int, quorum float64, -// min_depth float64, -// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) { - -// ctx, cancel := context.WithTimeout(context.Background(), timeout) -// defer cancel() - -// consensus := func() *obiseq.BioSequence { -// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,) -// if err != nil { -// cons = nil -// } - -// return cons -// } - -// computation := func() <-chan *obiseq.BioSequence { -// result := make(chan *obiseq.BioSequence) - -// go func() { -// select { -// case <-ctx.Done(): -// result <- nil -// default: -// result <- consensus() - -// } -// }() - -// return result -// } - -// calcResult := computation() - -// select { -// case result := <-calcResult: -// if result == nil { -// return nil, fmt.Errorf("cannot compute consensus") -// } -// return result, nil -// case <-ctx.Done(): -// return nil, fmt.Errorf("compute consensus timeout, exiting") -// } -// } - func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence { newIter := obiiter.MakeIBioSequence() size := 10 @@ -266,9 +150,7 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence { } consensus, err := BuildConsensus(sequences, id, - CLIKmerSize(), CLIThreshold(), - CLIKmerDepth(), - CLIMaxConsensusLength(), + CLIKmerSize(), CLISaveGraphToFiles(), CLIGraphFilesDirectory(), ) diff --git a/pkg/obitools/obiconsensus/options.go b/pkg/obitools/obiconsensus/options.go index 83c8d03..dcd5ec7 100644 --- a/pkg/obitools/obiconsensus/options.go +++ b/pkg/obitools/obiconsensus/options.go @@ -7,9 +7,6 @@ import ( var _saveGraph = "__@@NOSAVE@@__" var _kmerSize = -1 -var _threshold = 0.99 -var _mindepth = -1.0 -var _consensus_max_length = -1 func ObiconsensusOptionSet(options *getoptions.GetOpt) { @@ -25,26 +22,6 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) { "Default value = -1, which means that the kmer size is estimated from the data"), ) - options.Float64Var(&_threshold, "threshold", _threshold, - options.ArgName("RATIO"), - options.Description("A threshold between O and 1 used to determine the optimal "+ - "kmer size"), - ) - - options.Float64Var(&_mindepth, "min-depth", _mindepth, - options.ArgName("DEPTH"), - options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+ - "reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+ - "than 1, indicate the minimum count of occurrence for a kmer to be kept. "+ - "Default value = -1, which means that the DEPTH is estimated from the data"), - ) - - options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length, - options.ArgName("LENGTH"), - options.Description("Maximum length of the consensus sequence. "+ - "Default value = -1, which means that no limit is applied"), - ) - } func OptionSet(options *getoptions.GetOpt) { @@ -66,15 +43,3 @@ func CLIGraphFilesDirectory() string { func CLIKmerSize() int { return _kmerSize } - -func CLIKmerDepth() float64 { - return _mindepth -} - -func CLIThreshold() float64 { - return _threshold -} - -func CLIMaxConsensusLength() int { - return _consensus_max_length -} diff --git a/pkg/obitools/obiminion/obiminion.go b/pkg/obitools/obiminion/obiminion.go index ed6d6b7..4d939b1 100644 --- a/pkg/obitools/obiminion/obiminion.go +++ b/pkg/obitools/obiminion/obiminion.go @@ -157,7 +157,7 @@ func BuildDiffSeqGraph(name, name_key string, } func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], - sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice { + sample_key string, kmer_size int) obiseq.BioSequenceSlice { denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices)) for i, v := range *graph.Vertices { @@ -173,8 +173,6 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], clean, err = obiconsensus.BuildConsensus(pack, fmt.Sprintf("%s_consensus", v.Id()), kmer_size, - threshold, - depth, max_length, CLISaveGraphToFiles(), CLIGraphFilesDirectory()) if err != nil { @@ -209,11 +207,6 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence { samples := SeqBySamples(db, CLISampleAttribute()) db.Recycle(false) - log.Infof("Dataset composed of %d samples\n", len(samples)) - if CLIMaxConsensusLength() > 0 { - log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength()) - } - log.Infof("Dataset composed of %d samples\n", len(samples)) if CLISaveGraphToFiles() { @@ -266,10 +259,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence { denoised := MinionDenoise(graph, CLISampleAttribute(), - CLIKmerSize(), - CLIMaxConsensusLength(), - CLIThreshold(), - CLIKmerDepth()) + CLIKmerSize()) newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised)) diff --git a/pkg/obitools/obiminion/options.go b/pkg/obitools/obiminion/options.go index 68f7681..d0644fa 100644 --- a/pkg/obitools/obiminion/options.go +++ b/pkg/obitools/obiminion/options.go @@ -9,16 +9,11 @@ var _distStepMax = 1 var _sampleAttribute = "sample" var _ratioMax = 1.0 -var _minEvalRate = 1000 var _clusterMode = false var _onlyHead = false var _kmerSize = -1 -var _threshold = 1.0 -var _mindepth = -1.0 - -var _consensus_max_length = 1000 var _NoSingleton = false @@ -37,9 +32,6 @@ func ObiminionOptionSet(options *getoptions.GetOpt) { options.Alias("d"), options.Description("Maximum numbers of differences between two variant sequences (default: %d).")) - options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate, - options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate.")) - options.StringVar(&_saveGraph, "save-graph", _saveGraph, options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+ "The graph files follow the graphml format."), @@ -55,30 +47,9 @@ func ObiminionOptionSet(options *getoptions.GetOpt) { "Default value = -1, which means that the kmer size is estimated from the data"), ) - options.Float64Var(&_threshold, "threshold", _threshold, - options.ArgName("RATIO"), - options.Description("A threshold between O and 1 used to determine the optimal "+ - "kmer size"), - ) - - options.Float64Var(&_mindepth, "min-depth", _mindepth, - options.ArgName("DEPTH"), - options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+ - "reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+ - "than 1, indicate the minimum count of occurrence for a kmer to be kept. "+ - "Default value = -1, which means that the DEPTH is estimated from the data"), - ) - - options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length, - options.ArgName("LENGTH"), - options.Description("Maximum length of the consensus sequence. "+ - "Default value = -1, which means that no limit is applied"), - ) - options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton, options.Description("If set, sequences occurring a single time in the data set are discarded.")) - } // OptionSet sets up the options for the obiminion package. @@ -111,12 +82,6 @@ func CLISampleAttribute() string { return _sampleAttribute } -// > The function `CLIMinCountToEvalMutationRate()` returns the minimum number of reads that must be -// observed before the mutation rate can be evaluated -func CLIMinCountToEvalMutationRate() int { - return _minEvalRate -} - func ClusterMode() bool { return _clusterMode } @@ -158,18 +123,6 @@ func CLIKmerSize() int { return _kmerSize } -func CLIKmerDepth() float64 { - return _mindepth -} - -func CLIThreshold() float64 { - return _threshold -} - -func CLIMaxConsensusLength() int { - return _consensus_max_length -} - // CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded. // // No parameters.