diff --git a/cmd/obitools/obiconsensus/main.go b/cmd/obitools/obiconsensus/main.go index 1a8dc28..2d30d5e 100644 --- a/cmd/obitools/obiconsensus/main.go +++ b/cmd/obitools/obiconsensus/main.go @@ -16,7 +16,6 @@ func main() { optionParser := obioptions.GenerateOptionParser(obiconsensus.OptionSet) _, args := optionParser(os.Args) - obiconvert.SetFullFileBatch() fs, err := obiconvert.CLIReadBioSequences(args...) @@ -25,8 +24,9 @@ func main() { os.Exit(1) } - consensus := obiconsensus.Consensus(fs) - obiconvert.CLIWriteBioSequences(consensus, true) + cleaned := obiconsensus.CLIOBIMinion(fs) + + obiconvert.CLIWriteBioSequences(cleaned, true) obiiter.WaitForLastPipe() diff --git a/cmd/obitools/obiminion/main.go b/cmd/obitools/obiminion/main.go deleted file mode 100644 index ad74cda..0000000 --- a/cmd/obitools/obiminion/main.go +++ /dev/null @@ -1,33 +0,0 @@ -package main - -import ( - "os" - - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiminion" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" -) - -func main() { - optionParser := obioptions.GenerateOptionParser(obiminion.OptionSet) - - _, args := optionParser(os.Args) - - fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } - - cleaned := obiminion.CLIOBIMinion(fs) - - obiconvert.CLIWriteBioSequences(cleaned, true) - - obiiter.WaitForLastPipe() - -} diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 89d9428..9ef8c6b 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -6,12 +6,11 @@ import ( "fmt" "math" "math/bits" + "os" "slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" - "github.com/daichi-m/go18ds/sets/linkedhashset" - "github.com/daichi-m/go18ds/stacks/arraystack" log "github.com/sirupsen/logrus" ) @@ -397,6 +396,10 @@ func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) } } +// Push appends a BioSequence to the DeBruijnGraph. +// +// Parameters: +// - sequence: a pointer to a BioSequence containing the sequence to be added. func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) { s := sequence.Sequence() // Get the sequence as a byte slice w := sequence.Count() // Get the weight of the sequence @@ -493,45 +496,19 @@ func (graph *DeBruijnGraph) Gml() string { } -// fonction tri_topologique(G, V): -// T <- une liste vide pour stocker l'ordre topologique -// S <- une pile vide pour stocker les nœuds sans prédécesseurs -// pour chaque nœud v dans V: -// si Pred(v) est vide: -// empiler S avec v -// tant que S n'est pas vide: -// nœud <- dépiler S -// ajouter nœud à T -// pour chaque successeur s de nœud: -// supprimer l'arc (nœud, s) de G -// si Pred(s) est vide: -// empiler S avec s -// si G contient encore des arcs: -// renvoyer une erreur (le graphe contient au moins un cycle) -// sinon: -// renvoyer T (l'ordre topologique) +// WriteGml writes the DeBruijnGraph to a GML file. +// +// filename: the name of the file to write the GML representation to. +// error: an error if any occurs during the file creation or writing process. +func (graph *DeBruijnGraph) WriteGml(filename string) error { -// A topological sort of the graph. -func (g *DeBruijnGraph) PartialOrder() *linkedhashset.Set[uint64] { - S := arraystack.New[uint64]() - T := linkedhashset.New[uint64]() - - for v := range g.graph { - if len(g.Previouses(v)) == 0 { - S.Push(v) - } + f, err := os.Create(filename) + if err != nil { + return err } - - for !S.Empty() { - v, _ := S.Pop() - T.Add(v) - for _, w := range g.Nexts(v) { - if T.Contains(g.Previouses(w)...) { - S.Push(w) - } - } - } - return T + defer f.Close() + _, err = f.WriteString(graph.Gml()) + return err } // Calculating the hamming distance between two k-mers. diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index af5f10b..54f09f7 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -5,14 +5,21 @@ import ( "os" "path" "slices" + "sync" - log "github.com/sirupsen/logrus" - + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq" + "github.com/schollz/progressbar/v3" + log "github.com/sirupsen/logrus" ) func BuildConsensus(seqs obiseq.BioSequenceSlice, @@ -111,12 +118,198 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, return seq, err } -func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence { +// SampleWeight calculates the weight of a sample based on the statistics of a sequence. +// +// Parameters: +// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice) +// - sample: the sample for which the weight is calculated (string) +// - sample_key: the key used to access the sample's statistics (string) +// Return type: a function that takes an integer index and returns the weight of the sample at that index (func(int) int) +func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 { + + f := func(i int) float64 { + + stats := (*seqs)[i].StatsOn(sample_key, "NA") + + if value, ok := stats[sample]; ok { + return float64(value) + } + + return 0 + } + + return f +} + +// SeqBySamples sorts the sequences by samples. +// +// Parameters: +// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice) +// - sample_key: a string representing the sample key (string) +// +// Return type: +// - map[string]BioSequenceSlice: a map indexed by sample names, each containing a slice of BioSequence objects (map[string]BioSequenceSlice) +func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice { + + samples := make(map[string]*obiseq.BioSequenceSlice) + + for _, s := range seqs { + if s.HasStatsOn(sample_key) { + stats := s.StatsOn(sample_key, "NA") + for k := range stats { + if seqset, ok := samples[k]; ok { + *seqset = append(*seqset, s) + samples[k] = seqset + } else { + samples[k] = &obiseq.BioSequenceSlice{s} + } + } + } else { + if k, ok := s.GetStringAttribute(sample_key); ok { + if seqset, ok := samples[k]; ok { + *seqset = append(*seqset, s) + samples[k] = seqset + } else { + samples[k] = &obiseq.BioSequenceSlice{s} + } + } + } + } + + return samples + +} + +type Mutation struct { + Position int + SeqA byte + SeqB byte + Ratio float64 +} + +func BuildDiffSeqGraph(name, name_key string, + seqs *obiseq.BioSequenceSlice, + distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] { + graph := obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs)) + iseq := make(chan int) + defer graph.Close() + + ls := len(*seqs) + + sw := SampleWeight(seqs, name, name_key) + graph.Graph.VertexWeight = sw + + waiting := sync.WaitGroup{} + waiting.Add(nworkers) + + bar := (*progressbar.ProgressBar)(nil) + if obiconvert.CLIProgressBar() { + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)), + ) + + bar = progressbar.NewOptions(len(*seqs), pbopt...) + } + + computeEdges := func() { + defer waiting.Done() + for i := range iseq { + s1 := (*seqs)[i] + for j := i + 1; j < ls; j++ { + s2 := (*seqs)[j] + ratio := sw(i) / sw(j) + ok, pos, a1, a2 := obialign.D1Or0(s1, s2) + if ok >= 0 { + graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio}) + } else if distmax > 1 { + lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil) + dist := lali - lcs + if lcs > 0 && dist <= distmax { + // log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist) + graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio}) + } + } + } + + if bar != nil { + bar.Add(1) + } + } + } + + for i := 0; i < nworkers; i++ { + go computeEdges() + } + + for i := 0; i < ls; i++ { + iseq <- i + } + close(iseq) + + waiting.Wait() + return graph.Graph +} + +func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], + sample_key string, kmer_size int) obiseq.BioSequenceSlice { + denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices)) + + for i, v := range *graph.Vertices { + var err error + var clean *obiseq.BioSequence + degree := graph.Degree(i) + if degree > 4 { + pack := obiseq.MakeBioSequenceSlice(degree + 1) + for k, j := range graph.Neighbors(i) { + pack[k] = (*graph.Vertices)[j] + } + pack[degree] = v + clean, err = BuildConsensus(pack, + fmt.Sprintf("%s_consensus", v.Id()), + kmer_size, + CLISaveGraphToFiles(), CLIGraphFilesDirectory()) + + if err != nil { + log.Warning(err) + clean = (*graph.Vertices)[i] + clean.SetAttribute("obiminion_consensus", false) + } else { + clean.SetAttribute("obiminion_consensus", true) + } + pack.Recycle(false) + } else { + clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition()) + clean.SetAttribute("obiminion_consensus", false) + } + + clean.SetCount(int(graph.VertexWeight(i))) + clean.SetAttribute(sample_key, graph.Name) + + denoised[i] = clean + } + + return denoised +} +func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence { + dirname := CLIGraphFilesDirectory() newIter := obiiter.MakeIBioSequence() - size := 10 + + db := itertator.Load() + + log.Infof("Sequence dataset of %d sequeences loaded\n", len(db)) + + samples := SeqBySamples(db, CLISampleAttribute()) + db.Recycle(false) + + log.Infof("Dataset composed of %d samples\n", len(samples)) if CLISaveGraphToFiles() { - dirname := CLIGraphFilesDirectory() if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { // path does not exist or is not directory os.RemoveAll(dirname) @@ -128,52 +321,60 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence { } } + bar := (*progressbar.ProgressBar)(nil) + if obiconvert.CLIProgressBar() { + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Filter graph on abundance ratio]"), + ) + + bar = progressbar.NewOptions(len(samples), pbopt...) + } + newIter.Add(1) + go func() { + sample_order := 0 + for sample, seqs := range samples { + graph := BuildDiffSeqGraph(sample, + CLISampleAttribute(), + seqs, + CLIDistStepMax(), + obioptions.CLIParallelWorkers()) + if bar != nil { + bar.Add(1) + } + + if CLISaveGraphToFiles() { + graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml", + CLIGraphFilesDirectory(), + sample), + false, 1, 0, 3) + } + + denoised := MinionDenoise(graph, + CLISampleAttribute(), + CLIKmerSize()) + + newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised)) + + sample_order++ + } + + newIter.Done() + }() + go func() { newIter.WaitAndClose() }() - go func() { - order := 0 - iterator = iterator.SortBatches() - buffer := obiseq.MakeBioSequenceSlice() - - for iterator.Next() { - seqs := iterator.Get() - - sequences := seqs.Slice() - - id := sequences[0].Source() - if id == "" { - id = sequences[0].Id() - } - consensus, err := BuildConsensus(sequences, - id, - CLIKmerSize(), - CLISaveGraphToFiles(), - CLIGraphFilesDirectory(), - ) - - if err == nil { - buffer = append(buffer, consensus) - } - - if len(buffer) == size { - newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer)) - order++ - buffer = obiseq.MakeBioSequenceSlice() - } - seqs.Recycle(true) - } - - if len(buffer) > 0 { - newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer)) - } - - newIter.Done() - - }() - - return newIter + obiuniq.AddStatsOn(CLISampleAttribute()) + obiuniq.SetUniqueInMemory(false) + obiuniq.SetNoSingleton(CLINoSingleton()) + return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false)) } diff --git a/pkg/obitools/obiconsensus/options.go b/pkg/obitools/obiconsensus/options.go index dcd5ec7..4efde43 100644 --- a/pkg/obitools/obiconsensus/options.go +++ b/pkg/obitools/obiconsensus/options.go @@ -5,29 +5,90 @@ import ( "github.com/DavidGamba/go-getoptions" ) -var _saveGraph = "__@@NOSAVE@@__" +var _distStepMax = 1 +var _sampleAttribute = "sample" + +var _ratioMax = 1.0 + +var _clusterMode = false +var _onlyHead = false + var _kmerSize = -1 -func ObiconsensusOptionSet(options *getoptions.GetOpt) { +var _NoSingleton = false + +var _saveGraph = "__@@NOSAVE@@__" +var _saveRatio = "__@@NOSAVE@@__" + +// ObiminionOptionSet sets the options for obiminion. +// +// options: The options for configuring obiminion. +func ObiminionOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, + options.Alias("s"), + options.Description("Attribute containing sample descriptions (default %s).")) + + options.IntVar(&_distStepMax, "distance", _distStepMax, + options.Alias("d"), + options.Description("Maximum numbers of differences between two variant sequences (default: %d).")) options.StringVar(&_saveGraph, "save-graph", _saveGraph, - options.Description("Creates a directory containing the set of De Bruijn graphs used by "+ - "the obiconsensus algorithm. "+ + options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+ "The graph files follow the graphml format."), ) + options.StringVar(&_saveRatio, "save-ratio", _saveRatio, + options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+ + "The ratio file follows the csv format."), + ) options.IntVar(&_kmerSize, "kmer-size", _kmerSize, options.ArgName("SIZE"), options.Description("The size of the kmer used to build the consensus. "+ "Default value = -1, which means that the kmer size is estimated from the data"), ) + options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton, + options.Description("If set, sequences occurring a single time in the data set are discarded.")) + } +// OptionSet sets up the options for the obiminion package. +// +// It takes a pointer to a getoptions.GetOpt object as a parameter. +// It does not return any value. func OptionSet(options *getoptions.GetOpt) { obiconvert.InputOptionSet(options) obiconvert.OutputOptionSet(options) - ObiconsensusOptionSet(options) + ObiminionOptionSet(options) +} + +// CLIDistStepMax returns the maximum distance between two sequences. +// +// The value of the distance is set by the user with the `-d` flag. +// +// No parameters. +// Returns an integer. +func CLIDistStepMax() int { + return _distStepMax +} + +// CLISampleAttribute returns the name of the attribute used to store sample name. +// +// The value of the sample attribute is set by the user with the `-s` flag. +// +// No parameters. +// Returns a string. +func CLISampleAttribute() string { + return _sampleAttribute +} + +func ClusterMode() bool { + return _clusterMode +} + +// `OnlyHead()` returns a boolean value that indicates whether the `-h` flag was passed to the program +func OnlyHead() bool { + return _onlyHead } // Returns true it the obliclean graphs must be saved @@ -40,6 +101,32 @@ func CLIGraphFilesDirectory() string { return _saveGraph } +// Returns true it the table of ratio must be saved +func IsSaveRatioTable() bool { + return _saveRatio != "__@@NOSAVE@@__" +} + +// It returns the filename of the file that stores the ratio table +func RatioTableFilename() string { + return _saveRatio +} + +// CLIKmerSize returns the value of the kmer size to use for building the consensus. +// +// The value of the kmer size is set by the user with the `-k` flag. +// The value -1 means that the kmer size is estimated as the minimum value that +// insure that no kmer are present more than one time in a sequence. +// +// No parameters. +// Returns an integer value. func CLIKmerSize() int { return _kmerSize } + +// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded. +// +// No parameters. +// Returns a boolean value indicating whether or not singleton sequences should be discarded. +func CLINoSingleton() bool { + return _NoSingleton +} \ No newline at end of file diff --git a/pkg/obitools/obiminion/obiminion.go b/pkg/obitools/obiminion/obiminion.go deleted file mode 100644 index 4d939b1..0000000 --- a/pkg/obitools/obiminion/obiminion.go +++ /dev/null @@ -1,280 +0,0 @@ -package obiminion - -import ( - "fmt" - "os" - "sync" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq" - "github.com/schollz/progressbar/v3" - log "github.com/sirupsen/logrus" -) - -// SampleWeight calculates the weight of a sample based on the statistics of a sequence. -// -// Parameters: -// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice) -// - sample: the sample for which the weight is calculated (string) -// - sample_key: the key used to access the sample's statistics (string) -// Return type: a function that takes an integer index and returns the weight of the sample at that index (func(int) int) -func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 { - - f := func(i int) float64 { - - stats := (*seqs)[i].StatsOn(sample_key, "NA") - - if value, ok := stats[sample]; ok { - return float64(value) - } - - return 0 - } - - return f -} - -// SeqBySamples sorts the sequences by samples. -// -// Parameters: -// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice) -// - sample_key: a string representing the sample key (string) -// -// Return type: -// - map[string]BioSequenceSlice: a map indexed by sample names, each containing a slice of BioSequence objects (map[string]BioSequenceSlice) -func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice { - - samples := make(map[string]*obiseq.BioSequenceSlice) - - for _, s := range seqs { - if s.HasStatsOn(sample_key) { - stats := s.StatsOn(sample_key, "NA") - for k := range stats { - if seqset, ok := samples[k]; ok { - *seqset = append(*seqset, s) - samples[k] = seqset - } else { - samples[k] = &obiseq.BioSequenceSlice{s} - } - } - } else { - if k, ok := s.GetStringAttribute(sample_key); ok { - if seqset, ok := samples[k]; ok { - *seqset = append(*seqset, s) - samples[k] = seqset - } else { - samples[k] = &obiseq.BioSequenceSlice{s} - } - } - } - } - - return samples - -} - -type Mutation struct { - Position int - SeqA byte - SeqB byte - Ratio float64 -} - -func BuildDiffSeqGraph(name, name_key string, - seqs *obiseq.BioSequenceSlice, - distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] { - graph := obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs)) - iseq := make(chan int) - defer graph.Close() - - ls := len(*seqs) - - sw := SampleWeight(seqs, name, name_key) - graph.Graph.VertexWeight = sw - - waiting := sync.WaitGroup{} - waiting.Add(nworkers) - - bar := (*progressbar.ProgressBar)(nil) - if obiconvert.CLIProgressBar() { - - pbopt := make([]progressbar.Option, 0, 5) - pbopt = append(pbopt, - progressbar.OptionSetWriter(os.Stderr), - progressbar.OptionSetWidth(15), - progressbar.OptionShowIts(), - progressbar.OptionSetPredictTime(true), - progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)), - ) - - bar = progressbar.NewOptions(len(*seqs), pbopt...) - } - - computeEdges := func() { - defer waiting.Done() - for i := range iseq { - s1 := (*seqs)[i] - for j := i + 1; j < ls; j++ { - s2 := (*seqs)[j] - ratio := sw(i) / sw(j) - ok, pos, a1, a2 := obialign.D1Or0(s1, s2) - if ok >= 0 { - graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio}) - } else if distmax > 1 { - lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil) - dist := lali - lcs - if lcs > 0 && dist <= distmax { - // log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist) - graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio}) - } - } - } - - if bar != nil { - bar.Add(1) - } - } - } - - for i := 0; i < nworkers; i++ { - go computeEdges() - } - - for i := 0; i < ls; i++ { - iseq <- i - } - close(iseq) - - waiting.Wait() - return graph.Graph -} - -func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], - sample_key string, kmer_size int) obiseq.BioSequenceSlice { - denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices)) - - for i, v := range *graph.Vertices { - var err error - var clean *obiseq.BioSequence - degree := graph.Degree(i) - if degree > 4 { - pack := obiseq.MakeBioSequenceSlice(degree + 1) - for k,j := range graph.Neighbors(i) { - pack[k] = (*graph.Vertices)[j] - } - pack[degree] = v - clean, err = obiconsensus.BuildConsensus(pack, - fmt.Sprintf("%s_consensus", v.Id()), - kmer_size, - CLISaveGraphToFiles(), CLIGraphFilesDirectory()) - - if err != nil { - log.Warning(err) - clean = (*graph.Vertices)[i] - clean.SetAttribute("obiminion_consensus", false) - } else { - clean.SetAttribute("obiminion_consensus", true) - } - pack.Recycle(false) - } else { - clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition()) - clean.SetAttribute("obiminion_consensus", false) - } - - clean.SetCount(int(graph.VertexWeight(i))) - clean.SetAttribute(sample_key, graph.Name) - - denoised[i] = clean - } - - return denoised -} -func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence { - dirname := CLIGraphFilesDirectory() - newIter := obiiter.MakeIBioSequence() - - db := itertator.Load() - - log.Infof("Sequence dataset of %d sequeences loaded\n", len(db)) - - samples := SeqBySamples(db, CLISampleAttribute()) - db.Recycle(false) - - log.Infof("Dataset composed of %d samples\n", len(samples)) - - if CLISaveGraphToFiles() { - if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { - // path does not exist or is not directory - os.RemoveAll(dirname) - err := os.Mkdir(dirname, 0755) - - if err != nil { - log.Panicf("Cannot create directory %s for saving graphs", dirname) - } - } - } - - bar := (*progressbar.ProgressBar)(nil) - if obiconvert.CLIProgressBar() { - - pbopt := make([]progressbar.Option, 0, 5) - pbopt = append(pbopt, - progressbar.OptionSetWriter(os.Stderr), - progressbar.OptionSetWidth(15), - progressbar.OptionShowIts(), - progressbar.OptionSetPredictTime(true), - progressbar.OptionSetDescription("[Filter graph on abundance ratio]"), - ) - - bar = progressbar.NewOptions(len(samples), pbopt...) - } - - newIter.Add(1) - - go func() { - sample_order := 0 - for sample, seqs := range samples { - graph := BuildDiffSeqGraph(sample, - CLISampleAttribute(), - seqs, - CLIDistStepMax(), - obioptions.CLIParallelWorkers()) - if bar != nil { - bar.Add(1) - } - - if CLISaveGraphToFiles() { - graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml", - CLIGraphFilesDirectory(), - sample), - false, 1, 0, 3) - } - - denoised := MinionDenoise(graph, - CLISampleAttribute(), - CLIKmerSize()) - - newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised)) - - sample_order++ - } - - newIter.Done() - }() - - go func() { - newIter.WaitAndClose() - }() - - obiuniq.AddStatsOn(CLISampleAttribute()) - obiuniq.SetUniqueInMemory(false) - obiuniq.SetNoSingleton(CLINoSingleton()) - return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false)) -} diff --git a/pkg/obitools/obiminion/options.go b/pkg/obitools/obiminion/options.go deleted file mode 100644 index d0644fa..0000000 --- a/pkg/obitools/obiminion/options.go +++ /dev/null @@ -1,132 +0,0 @@ -package obiminion - -import ( - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "github.com/DavidGamba/go-getoptions" -) - -var _distStepMax = 1 -var _sampleAttribute = "sample" - -var _ratioMax = 1.0 - -var _clusterMode = false -var _onlyHead = false - -var _kmerSize = -1 - -var _NoSingleton = false - -var _saveGraph = "__@@NOSAVE@@__" -var _saveRatio = "__@@NOSAVE@@__" - -// ObiminionOptionSet sets the options for obiminion. -// -// options: The options for configuring obiminion. -func ObiminionOptionSet(options *getoptions.GetOpt) { - options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, - options.Alias("s"), - options.Description("Attribute containing sample descriptions (default %s).")) - - options.IntVar(&_distStepMax, "distance", _distStepMax, - options.Alias("d"), - options.Description("Maximum numbers of differences between two variant sequences (default: %d).")) - - options.StringVar(&_saveGraph, "save-graph", _saveGraph, - options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+ - "The graph files follow the graphml format."), - ) - - options.StringVar(&_saveRatio, "save-ratio", _saveRatio, - options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+ - "The ratio file follows the csv format."), - ) - options.IntVar(&_kmerSize, "kmer-size", _kmerSize, - options.ArgName("SIZE"), - options.Description("The size of the kmer used to build the consensus. "+ - "Default value = -1, which means that the kmer size is estimated from the data"), - ) - - options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton, - options.Description("If set, sequences occurring a single time in the data set are discarded.")) - -} - -// OptionSet sets up the options for the obiminion package. -// -// It takes a pointer to a getoptions.GetOpt object as a parameter. -// It does not return any value. -func OptionSet(options *getoptions.GetOpt) { - obiconvert.InputOptionSet(options) - obiconvert.OutputOptionSet(options) - ObiminionOptionSet(options) -} - -// CLIDistStepMax returns the maximum distance between two sequences. -// -// The value of the distance is set by the user with the `-d` flag. -// -// No parameters. -// Returns an integer. -func CLIDistStepMax() int { - return _distStepMax -} - -// CLISampleAttribute returns the name of the attribute used to store sample name. -// -// The value of the sample attribute is set by the user with the `-s` flag. -// -// No parameters. -// Returns a string. -func CLISampleAttribute() string { - return _sampleAttribute -} - -func ClusterMode() bool { - return _clusterMode -} - -// `OnlyHead()` returns a boolean value that indicates whether the `-h` flag was passed to the program -func OnlyHead() bool { - return _onlyHead -} - -// Returns true it the obliclean graphs must be saved -func CLISaveGraphToFiles() bool { - return _saveGraph != "__@@NOSAVE@@__" -} - -// It returns the directory where the graph files are saved -func CLIGraphFilesDirectory() string { - return _saveGraph -} - -// Returns true it the table of ratio must be saved -func IsSaveRatioTable() bool { - return _saveRatio != "__@@NOSAVE@@__" -} - -// It returns the filename of the file that stores the ratio table -func RatioTableFilename() string { - return _saveRatio -} - -// CLIKmerSize returns the value of the kmer size to use for building the consensus. -// -// The value of the kmer size is set by the user with the `-k` flag. -// The value -1 means that the kmer size is estimated as the minimum value that -// insure that no kmer are present more than one time in a sequence. -// -// No parameters. -// Returns an integer value. -func CLIKmerSize() int { - return _kmerSize -} - -// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded. -// -// No parameters. -// Returns a boolean value indicating whether or not singleton sequences should be discarded. -func CLINoSingleton() bool { - return _NoSingleton -} \ No newline at end of file