From 4bf041be62f4fbcae03d6b2575fb72ec8f82a3fe Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 28 Mar 2023 21:23:27 +0700 Subject: [PATCH] Add some option to the obiconsensus command Former-commit-id: cf30a404a4943e4527106c977b01712ef454e028 --- cmd/obitools/obiconsensus/main.go | 4 +- pkg/obitools/obiconsensus/obiconsensus.go | 136 ++++++++++++++-------- pkg/obitools/obiconsensus/options.go | 56 +++++++++ 3 files changed, 146 insertions(+), 50 deletions(-) diff --git a/cmd/obitools/obiconsensus/main.go b/cmd/obitools/obiconsensus/main.go index 9da6155..0f964ff 100644 --- a/cmd/obitools/obiconsensus/main.go +++ b/cmd/obitools/obiconsensus/main.go @@ -13,7 +13,7 @@ import ( ) func main() { - optionParser := obioptions.GenerateOptionParser(obiconvert.OptionSet) + optionParser := obioptions.GenerateOptionParser(obiconsensus.OptionSet) _, args := optionParser(os.Args) obiconvert.SetFullFileBatch() @@ -25,7 +25,7 @@ func main() { os.Exit(1) } - consensus := obiconsensus.Consensus(fs, 0.95) + consensus := obiconsensus.Consensus(fs) obiconvert.CLIWriteBioSequences(consensus, true) obiiter.WaitForLastPipe() diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index c000fc0..b1d0b1e 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -1,6 +1,9 @@ package obiconsensus import ( + "fmt" + "os" + "path" "sort" log "github.com/sirupsen/logrus" @@ -12,25 +15,30 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" ) -func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSequence, error) { +func BuildConsensus(seqs obiseq.BioSequenceSlice, + kmer_size int, quorum float64, + min_depth float64, + save_graph bool, dirname string) (*obiseq.BioSequence, error) { log.Printf("Number of reads : %d\n", len(seqs)) - longest := make([]int, len(seqs)) + if kmer_size < 0 { + longest := make([]int, len(seqs)) - for i := range seqs { - s := seqs[i : i+1] - sa := obisuffix.BuildSuffixArray(&s) - longest[i] = obiutils.MaxSlice(sa.CommonSuffix()) + for i := range seqs { + s := seqs[i : i+1] + sa := obisuffix.BuildSuffixArray(&s) + longest[i] = obiutils.MaxSlice(sa.CommonSuffix()) + } + + o := obiutils.Order(sort.IntSlice(longest)) + i := int(float64(len(seqs)) * quorum) + + kmer_size = longest[o[i]] + 1 + log.Printf("estimated kmer size : %d", kmer_size) } - o := obiutils.Order(sort.IntSlice(longest)) - i := int(float64(len(seqs)) * quorum) - - kmersize := longest[o[i]] + 1 - log.Printf("estimated kmer size : %d", kmersize) - - graph := obikmer.MakeDeBruijnGraph(kmersize) + graph := obikmer.MakeDeBruijnGraph(kmer_size) for _, s := range seqs { graph.Push(s) @@ -38,50 +46,64 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSe log.Printf("Graph size : %d\n", graph.Len()) total_kmer := graph.Len() - spectrum := graph.LinkSpectrum() - cum := make(map[int]int) - - spectrum[1] = 0 - for i := 2; i < len(spectrum); i++ { - spectrum[i] += spectrum[i-1] - cum[spectrum[i]]++ - } - - max := 0 - kmax := 0 - for k, obs := range cum { - if obs > max { - max = obs - kmax = k - } - } threshold := 0 - for i, total := range spectrum { - if total == kmax { - threshold = i - break + + switch { + case min_depth < 0: + spectrum := graph.LinkSpectrum() + cum := make(map[int]int) + + spectrum[1] = 0 + for i := 2; i < len(spectrum); i++ { + spectrum[i] += spectrum[i-1] + cum[spectrum[i]]++ } + + max := 0 + kmax := 0 + for k, obs := range cum { + if obs > max { + max = obs + kmax = k + } + } + + for i, total := range spectrum { + if total == kmax { + threshold = i + break + } + } + threshold /= 2 + case min_depth >= 1: + threshold = int(min_depth) + default: + threshold = int(float64(len(seqs)) * min_depth) } - threshold /= 2 + graph.FilterMin(threshold) + log.Printf("Graph size : %d\n", graph.Len()) - // file, err := os.Create( - // fmt.Sprintf("%s.gml", seqs[0].Source())) - - // if err != nil { - // fmt.Println(err) - // } else { - // file.WriteString(graph.GML()) - // file.Close() - // } + if save_graph { + + file, err := os.Create(path.Join(dirname, + fmt.Sprintf("%s.gml", seqs[0].Source()))) + + if err != nil { + fmt.Println(err) + } else { + file.WriteString(graph.Gml()) + file.Close() + } + } seq, err := graph.LongestConsensus(seqs[0].Source()) seq.SetCount(len(seqs)) seq.SetAttribute("seq_length", seq.Len()) - seq.SetAttribute("kmer_size", kmersize) + seq.SetAttribute("kmer_size", kmer_size) seq.SetAttribute("kmer_min_occur", threshold) seq.SetAttribute("kmer_max_occur", graph.MaxLink()) seq.SetAttribute("filtered_graph_size", graph.Len()) @@ -90,10 +112,23 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSe return seq, err } -func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequence { +func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence { newIter := obiiter.MakeIBioSequence() size := 10 + if CLISaveGraphToFiles() { + dirname := CLIGraphFilesDirectory() + if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { + // path does not exist or is not directory + os.RemoveAll(dirname) + err := os.Mkdir(dirname, 0755) + + if err != nil { + log.Panicf("Cannot create directory %s for saving graphs", dirname) + } + } + } + newIter.Add(1) go func() { @@ -107,7 +142,12 @@ func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequen for iterator.Next() { seqs := iterator.Get() - consensus, err := BuildConsensus(seqs.Slice(), quorum) + + consensus, err := BuildConsensus(seqs.Slice(), + CLIKmerSize(), CLIThreshold(), + CLIKmerDepth(), + CLISaveGraphToFiles(), CLIGraphFilesDirectory(), + ) if err == nil { buffer = append(buffer, consensus) @@ -118,7 +158,7 @@ func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequen order++ buffer = obiseq.MakeBioSequenceSlice() } - seqs.Recycle() + seqs.Recycle(true) } if len(buffer) > 0 { diff --git a/pkg/obitools/obiconsensus/options.go b/pkg/obitools/obiconsensus/options.go index 3a418cc..6223c11 100644 --- a/pkg/obitools/obiconsensus/options.go +++ b/pkg/obitools/obiconsensus/options.go @@ -5,9 +5,65 @@ import ( "github.com/DavidGamba/go-getoptions" ) +var _saveGraph = "__@@NOSAVE@@__" +var _kmerSize = -1 +var _threshold = 0.99 +var _mindepth = -1.0 + +func ObiconsensusOptionSet(options *getoptions.GetOpt) { + + options.StringVar(&_saveGraph, "save-graph", _saveGraph, + options.Description("Creates a directory containing the set of De Bruijn graphs used by "+ + "the obiconsensus algorithm. "+ + "The graph files follow the graphml format."), + ) + + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.ArgName("SIZE"), + options.Description("The size of the kmer used to build the consensus. "+ + "Default value = -1, which means that the kmer size is estimated from the data"), + ) + + options.Float64Var(&_threshold, "threshold", _threshold, + options.ArgName("RATIO"), + options.Description("A threshold between O and 1 used to determine the optimal "+ + "kmer size"), + ) + + options.Float64Var(&_mindepth, "min-depth", _mindepth, + options.ArgName("DEPTH"), + options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+ + "reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+ + "than 1, indicate the minimum count of occurrence for a kmer to be kept. "+ + "Default value = -1, which means that the DEPTH is estimated from the data"), + ) + +} func OptionSet(options *getoptions.GetOpt) { obiconvert.InputOptionSet(options) obiconvert.OutputOptionSet(options) + ObiconsensusOptionSet(options) } +// Returns true it the obliclean graphs must be saved +func CLISaveGraphToFiles() bool { + return _saveGraph != "__@@NOSAVE@@__" +} + +// It returns the directory where the graph files are saved +func CLIGraphFilesDirectory() string { + return _saveGraph +} + +func CLIKmerSize() int { + return _kmerSize +} + +func CLIKmerDepth() float64 { + return _mindepth +} + +func CLIThreshold() float64 { + return _threshold +} \ No newline at end of file