mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-10 09:40:27 +00:00
Add obiminion first version
Former-commit-id: aa5ace7bd4d2266333715fca7094d1c3cbbb5e6d
This commit is contained in:
@@ -4,92 +4,93 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"sort"
|
||||
"slices"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
consensus_id string,
|
||||
kmer_size int, quorum float64,
|
||||
min_depth float64,
|
||||
max_length int,
|
||||
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
|
||||
|
||||
if save_graph {
|
||||
if dirname == "" {
|
||||
dirname = "."
|
||||
}
|
||||
|
||||
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||
// path does not exist or is not directory
|
||||
os.RemoveAll(dirname)
|
||||
err := os.Mkdir(dirname, 0755)
|
||||
|
||||
if err != nil {
|
||||
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||
}
|
||||
}
|
||||
|
||||
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id)))
|
||||
|
||||
if err == nil {
|
||||
defer fasta.Close()
|
||||
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false))
|
||||
fasta.Close()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
log.Printf("Number of reads : %d\n", len(seqs))
|
||||
|
||||
if kmer_size < 0 {
|
||||
longest := make([]int, len(seqs))
|
||||
|
||||
for i := range seqs {
|
||||
s := seqs[i : i+1]
|
||||
for i, seq := range seqs {
|
||||
s := obiseq.BioSequenceSlice{seq}
|
||||
sa := obisuffix.BuildSuffixArray(&s)
|
||||
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
|
||||
longest[i] = slices.Max(sa.CommonSuffix())
|
||||
}
|
||||
|
||||
o := obiutils.Order(sort.IntSlice(longest))
|
||||
i := int(float64(len(seqs)) * quorum)
|
||||
// o := obiutils.Order(sort.IntSlice(longest))
|
||||
// i := int(float64(len(seqs)) * quorum)
|
||||
|
||||
kmer_size = longest[o[i]] + 1
|
||||
// if i >= len(o) {
|
||||
// i = len(o) - 1
|
||||
// }
|
||||
|
||||
kmer_size = slices.Max(longest) + 1
|
||||
|
||||
// kmer_size = longest[o[i]] + 1
|
||||
log.Printf("estimated kmer size : %d", kmer_size)
|
||||
}
|
||||
|
||||
graph := obikmer.MakeDeBruijnGraph(kmer_size)
|
||||
var graph *obikmer.DeBruijnGraph
|
||||
for {
|
||||
graph = obikmer.MakeDeBruijnGraph(kmer_size)
|
||||
|
||||
for _, s := range seqs {
|
||||
graph.Push(s)
|
||||
for _, s := range seqs {
|
||||
graph.Push(s)
|
||||
}
|
||||
|
||||
if !graph.HasCycle() {
|
||||
break
|
||||
}
|
||||
|
||||
kmer_size++
|
||||
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
|
||||
}
|
||||
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
total_kmer := graph.Len()
|
||||
|
||||
threshold := 0
|
||||
|
||||
switch {
|
||||
case min_depth < 0:
|
||||
spectrum := graph.LinkSpectrum()
|
||||
cum := make(map[int]int)
|
||||
|
||||
spectrum[1] = 0
|
||||
for i := 2; i < len(spectrum); i++ {
|
||||
spectrum[i] += spectrum[i-1]
|
||||
cum[spectrum[i]]++
|
||||
}
|
||||
|
||||
max := 0
|
||||
kmax := 0
|
||||
for k, obs := range cum {
|
||||
if obs > max {
|
||||
max = obs
|
||||
kmax = k
|
||||
}
|
||||
}
|
||||
|
||||
for i, total := range spectrum {
|
||||
if total == kmax {
|
||||
threshold = i
|
||||
break
|
||||
}
|
||||
}
|
||||
threshold /= 2
|
||||
case min_depth >= 1:
|
||||
threshold = int(min_depth)
|
||||
default:
|
||||
threshold = int(float64(len(seqs)) * min_depth)
|
||||
}
|
||||
|
||||
graph.FilterMin(threshold)
|
||||
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
|
||||
if save_graph {
|
||||
|
||||
file, err := os.Create(path.Join(dirname,
|
||||
fmt.Sprintf("%s.gml", seqs[0].Source())))
|
||||
fmt.Sprintf("%s_raw_consensus.gml", consensus_id)))
|
||||
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
@@ -99,29 +100,133 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
}
|
||||
}
|
||||
|
||||
id := seqs[0].Source()
|
||||
if id == "" {
|
||||
id = seqs[0].Id()
|
||||
}
|
||||
seq, err := graph.LongestConsensus(id)
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
total_kmer := graph.Len()
|
||||
|
||||
// threshold := 0
|
||||
|
||||
// switch {
|
||||
// case min_depth < 0:
|
||||
// spectrum := graph.WeightSpectrum()
|
||||
// cum := make(map[int]int)
|
||||
|
||||
// spectrum[1] = 0
|
||||
// for i := 2; i < len(spectrum); i++ {
|
||||
// spectrum[i] += spectrum[i-1]
|
||||
// cum[spectrum[i]]++
|
||||
// }
|
||||
|
||||
// max := 0
|
||||
// kmax := 0
|
||||
// for k, obs := range cum {
|
||||
// if obs > max {
|
||||
// max = obs
|
||||
// kmax = k
|
||||
// }
|
||||
// }
|
||||
|
||||
// for i, total := range spectrum {
|
||||
// if total == kmax {
|
||||
// threshold = i
|
||||
// break
|
||||
// }
|
||||
// }
|
||||
// threshold /= 2
|
||||
|
||||
// if threshold < 1 {
|
||||
// threshold = 1
|
||||
// }
|
||||
|
||||
// log.Info("Estimated kmer_min_occur = ", threshold)
|
||||
// case min_depth >= 1:
|
||||
// threshold = int(min_depth)
|
||||
// default:
|
||||
// threshold = int(float64(len(seqs)) * min_depth)
|
||||
// }
|
||||
|
||||
// graph.FilterMinWeight(threshold)
|
||||
|
||||
// log.Printf("Graph size : %d\n", graph.Len())
|
||||
|
||||
// if save_graph {
|
||||
|
||||
// file, err := os.Create(path.Join(dirname,
|
||||
// fmt.Sprintf("%s_consensus.gml", consensus_id)))
|
||||
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// } else {
|
||||
// file.WriteString(graph.Gml())
|
||||
// file.Close()
|
||||
// }
|
||||
// }
|
||||
|
||||
seq, err := graph.LongestConsensus(consensus_id, max_length)
|
||||
|
||||
sumCount := 0
|
||||
|
||||
for _, s := range seqs {
|
||||
sumCount += s.Count()
|
||||
if seq != nil {
|
||||
for _, s := range seqs {
|
||||
sumCount += s.Count()
|
||||
}
|
||||
|
||||
seq.SetCount(sumCount)
|
||||
seq.SetAttribute("seq_length", seq.Len())
|
||||
seq.SetAttribute("kmer_size", kmer_size)
|
||||
//seq.SetAttribute("kmer_min_occur", threshold)
|
||||
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
|
||||
seq.SetAttribute("filtered_graph_size", graph.Len())
|
||||
seq.SetAttribute("full_graph_size", total_kmer)
|
||||
}
|
||||
|
||||
seq.SetCount(sumCount)
|
||||
seq.SetAttribute("seq_length", seq.Len())
|
||||
seq.SetAttribute("kmer_size", kmer_size)
|
||||
seq.SetAttribute("kmer_min_occur", threshold)
|
||||
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
|
||||
seq.SetAttribute("filtered_graph_size", graph.Len())
|
||||
seq.SetAttribute("full_graph_size", total_kmer)
|
||||
|
||||
return seq, err
|
||||
}
|
||||
|
||||
// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice,
|
||||
// kmer_size int, quorum float64,
|
||||
// min_depth float64,
|
||||
// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) {
|
||||
|
||||
// ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
// defer cancel()
|
||||
|
||||
// consensus := func() *obiseq.BioSequence {
|
||||
// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,)
|
||||
// if err != nil {
|
||||
// cons = nil
|
||||
// }
|
||||
|
||||
// return cons
|
||||
// }
|
||||
|
||||
// computation := func() <-chan *obiseq.BioSequence {
|
||||
// result := make(chan *obiseq.BioSequence)
|
||||
|
||||
// go func() {
|
||||
// select {
|
||||
// case <-ctx.Done():
|
||||
// result <- nil
|
||||
// default:
|
||||
// result <- consensus()
|
||||
|
||||
// }
|
||||
// }()
|
||||
|
||||
// return result
|
||||
// }
|
||||
|
||||
// calcResult := computation()
|
||||
|
||||
// select {
|
||||
// case result := <-calcResult:
|
||||
// if result == nil {
|
||||
// return nil, fmt.Errorf("cannot compute consensus")
|
||||
// }
|
||||
// return result, nil
|
||||
// case <-ctx.Done():
|
||||
// return nil, fmt.Errorf("compute consensus timeout, exiting")
|
||||
// }
|
||||
// }
|
||||
|
||||
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
size := 10
|
||||
@@ -153,10 +258,19 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
|
||||
consensus, err := BuildConsensus(seqs.Slice(),
|
||||
sequences := seqs.Slice()
|
||||
|
||||
id := sequences[0].Source()
|
||||
if id == "" {
|
||||
id = sequences[0].Id()
|
||||
}
|
||||
consensus, err := BuildConsensus(sequences,
|
||||
id,
|
||||
CLIKmerSize(), CLIThreshold(),
|
||||
CLIKmerDepth(),
|
||||
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
|
||||
CLIMaxConsensusLength(),
|
||||
CLISaveGraphToFiles(),
|
||||
CLIGraphFilesDirectory(),
|
||||
)
|
||||
|
||||
if err == nil {
|
||||
|
||||
@@ -9,6 +9,7 @@ var _saveGraph = "__@@NOSAVE@@__"
|
||||
var _kmerSize = -1
|
||||
var _threshold = 0.99
|
||||
var _mindepth = -1.0
|
||||
var _consensus_max_length = -1
|
||||
|
||||
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
@@ -38,6 +39,12 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
||||
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
||||
)
|
||||
|
||||
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Description("Maximum length of the consensus sequence. "+
|
||||
"Default value = -1, which means that no limit is applied"),
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
@@ -67,3 +74,7 @@ func CLIKmerDepth() float64 {
|
||||
func CLIThreshold() float64 {
|
||||
return _threshold
|
||||
}
|
||||
|
||||
func CLIMaxConsensusLength() int {
|
||||
return _consensus_max_length
|
||||
}
|
||||
|
||||
290
pkg/obitools/obiminion/obiminion.go
Normal file
290
pkg/obitools/obiminion/obiminion.go
Normal file
@@ -0,0 +1,290 @@
|
||||
package obiminion
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// SampleWeight calculates the weight of a sample based on the statistics of a sequence.
|
||||
//
|
||||
// Parameters:
|
||||
// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice)
|
||||
// - sample: the sample for which the weight is calculated (string)
|
||||
// - sample_key: the key used to access the sample's statistics (string)
|
||||
// Return type: a function that takes an integer index and returns the weight of the sample at that index (func(int) int)
|
||||
func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 {
|
||||
|
||||
f := func(i int) float64 {
|
||||
|
||||
stats := (*seqs)[i].StatsOn(sample_key, "NA")
|
||||
|
||||
if value, ok := stats[sample]; ok {
|
||||
return float64(value)
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// SeqBySamples sorts the sequences by samples.
|
||||
//
|
||||
// Parameters:
|
||||
// - seqs: a pointer to BioSequenceSlice representing the sequences (*BioSequenceSlice)
|
||||
// - sample_key: a string representing the sample key (string)
|
||||
//
|
||||
// Return type:
|
||||
// - map[string]BioSequenceSlice: a map indexed by sample names, each containing a slice of BioSequence objects (map[string]BioSequenceSlice)
|
||||
func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice {
|
||||
|
||||
samples := make(map[string]*obiseq.BioSequenceSlice)
|
||||
|
||||
for _, s := range seqs {
|
||||
if s.HasStatsOn(sample_key) {
|
||||
stats := s.StatsOn(sample_key, "NA")
|
||||
for k := range stats {
|
||||
if seqset, ok := samples[k]; ok {
|
||||
*seqset = append(*seqset, s)
|
||||
samples[k] = seqset
|
||||
} else {
|
||||
samples[k] = &obiseq.BioSequenceSlice{s}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if k, ok := s.GetStringAttribute(sample_key); ok {
|
||||
if seqset, ok := samples[k]; ok {
|
||||
*seqset = append(*seqset, s)
|
||||
samples[k] = seqset
|
||||
} else {
|
||||
samples[k] = &obiseq.BioSequenceSlice{s}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return samples
|
||||
|
||||
}
|
||||
|
||||
// Mutation is the payload attached to an edge of the graph built by
// BuildDiffSeqGraph between two sequences A (lower index) and B (higher index).
type Mutation struct {
|
||||
// Position is the position value returned by obialign.D1Or0 for the pair.
Position int
|
||||
// SeqA is the first byte returned by obialign.D1Or0 (presumably the symbol
// observed in sequence A at Position — confirm in obialign).
SeqA byte
|
||||
// SeqB is the second byte returned by obialign.D1Or0 (presumably the symbol
// observed in sequence B at Position — confirm in obialign).
SeqB byte
|
||||
// Ratio is the weight of sequence A divided by the weight of sequence B.
Ratio float64
|
||||
}
|
||||
|
||||
func BuildDiffSeqGraph(name, name_key string,
|
||||
seqs *obiseq.BioSequenceSlice,
|
||||
distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] {
|
||||
graph := obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs))
|
||||
iseq := make(chan int)
|
||||
defer graph.Close()
|
||||
|
||||
ls := len(*seqs)
|
||||
|
||||
sw := SampleWeight(seqs, name, name_key)
|
||||
graph.Graph.VertexWeight = sw
|
||||
|
||||
waiting := sync.WaitGroup{}
|
||||
waiting.Add(nworkers)
|
||||
|
||||
bar := (*progressbar.ProgressBar)(nil)
|
||||
if obiconvert.CLIProgressBar() {
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)),
|
||||
)
|
||||
|
||||
bar = progressbar.NewOptions(len(*seqs), pbopt...)
|
||||
}
|
||||
|
||||
computeEdges := func() {
|
||||
defer waiting.Done()
|
||||
for i := range iseq {
|
||||
s1 := (*seqs)[i]
|
||||
for j := i + 1; j < ls; j++ {
|
||||
s2 := (*seqs)[j]
|
||||
ratio := sw(i) / sw(j)
|
||||
ok, pos, a1, a2 := obialign.D1Or0(s1, s2)
|
||||
if ok >= 0 {
|
||||
graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
|
||||
} else if distmax > 1 {
|
||||
lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil)
|
||||
dist := lali - lcs
|
||||
if lcs > 0 && dist <= distmax {
|
||||
// log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist)
|
||||
graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i := 0; i < nworkers; i++ {
|
||||
go computeEdges()
|
||||
}
|
||||
|
||||
for i := 0; i < ls; i++ {
|
||||
iseq <- i
|
||||
}
|
||||
close(iseq)
|
||||
|
||||
waiting.Wait()
|
||||
return graph.Graph
|
||||
}
|
||||
|
||||
func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
||||
sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice {
|
||||
denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))
|
||||
|
||||
for i, v := range *graph.Vertices {
|
||||
var err error
|
||||
var clean *obiseq.BioSequence
|
||||
degree := graph.Degree(i)
|
||||
if degree > 4 {
|
||||
pack := obiseq.MakeBioSequenceSlice(degree + 1)
|
||||
for k,j := range graph.Neighbors(i) {
|
||||
pack[k] = (*graph.Vertices)[j]
|
||||
}
|
||||
pack[degree] = v
|
||||
clean, err = obiconsensus.BuildConsensus(pack,
|
||||
fmt.Sprintf("%s_consensus", v.Id()),
|
||||
kmer_size,
|
||||
threshold,
|
||||
depth, max_length,
|
||||
CLISaveGraphToFiles(), CLIGraphFilesDirectory())
|
||||
|
||||
if err != nil {
|
||||
log.Warning(err)
|
||||
clean = (*graph.Vertices)[i]
|
||||
clean.SetAttribute("obiminion_consensus", false)
|
||||
} else {
|
||||
clean.SetAttribute("obiminion_consensus", true)
|
||||
}
|
||||
pack.Recycle(false)
|
||||
} else {
|
||||
clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition())
|
||||
clean.SetAttribute("obiminion_consensus", false)
|
||||
}
|
||||
|
||||
clean.SetCount(int(graph.VertexWeight(i)))
|
||||
clean.SetAttribute(sample_key, graph.Name)
|
||||
|
||||
denoised[i] = clean
|
||||
}
|
||||
|
||||
return denoised
|
||||
}
|
||||
func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
dirname := CLIGraphFilesDirectory()
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
db := itertator.Load()
|
||||
|
||||
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
|
||||
|
||||
samples := SeqBySamples(db, CLISampleAttribute())
|
||||
db.Recycle(false)
|
||||
|
||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||
if CLIMaxConsensusLength() > 0 {
|
||||
log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength())
|
||||
}
|
||||
|
||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||
|
||||
if CLISaveGraphToFiles() {
|
||||
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||
// path does not exist or is not directory
|
||||
os.RemoveAll(dirname)
|
||||
err := os.Mkdir(dirname, 0755)
|
||||
|
||||
if err != nil {
|
||||
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bar := (*progressbar.ProgressBar)(nil)
|
||||
if obiconvert.CLIProgressBar() {
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription("[Filter graph on abundance ratio]"),
|
||||
)
|
||||
|
||||
bar = progressbar.NewOptions(len(samples), pbopt...)
|
||||
}
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
sample_order := 0
|
||||
for sample, seqs := range samples {
|
||||
graph := BuildDiffSeqGraph(sample,
|
||||
CLISampleAttribute(),
|
||||
seqs,
|
||||
CLIDistStepMax(),
|
||||
obioptions.CLIParallelWorkers())
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
|
||||
if CLISaveGraphToFiles() {
|
||||
graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml",
|
||||
CLIGraphFilesDirectory(),
|
||||
sample),
|
||||
false, 1, 0, 3)
|
||||
}
|
||||
|
||||
denoised := MinionDenoise(graph,
|
||||
CLISampleAttribute(),
|
||||
CLIKmerSize(),
|
||||
CLIMaxConsensusLength(),
|
||||
CLIThreshold(),
|
||||
CLIKmerDepth())
|
||||
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))
|
||||
|
||||
sample_order++
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
obiuniq.AddStatsOn(CLISampleAttribute())
|
||||
obiuniq.SetUniqueInMemory(false)
|
||||
obiuniq.SetNoSingleton(CLINoSingleton())
|
||||
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
|
||||
}
|
||||
179
pkg/obitools/obiminion/options.go
Normal file
179
pkg/obitools/obiminion/options.go
Normal file
@@ -0,0 +1,179 @@
|
||||
package obiminion
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _distStepMax = 1
|
||||
var _sampleAttribute = "sample"
|
||||
|
||||
var _ratioMax = 1.0
|
||||
var _minEvalRate = 1000
|
||||
|
||||
var _clusterMode = false
|
||||
var _onlyHead = false
|
||||
|
||||
var _kmerSize = -1
|
||||
var _threshold = 1.0
|
||||
var _mindepth = -1.0
|
||||
|
||||
var _consensus_max_length = 1000
|
||||
|
||||
var _NoSingleton = false
|
||||
|
||||
var _saveGraph = "__@@NOSAVE@@__"
|
||||
var _saveRatio = "__@@NOSAVE@@__"
|
||||
|
||||
// ObiminionOptionSet sets the options for obiminion.
|
||||
//
|
||||
// options: The options for configuring obiminion.
|
||||
func ObiminionOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||
options.Alias("s"),
|
||||
options.Description("Attribute containing sample descriptions (default %s)."))
|
||||
|
||||
options.IntVar(&_distStepMax, "distance", _distStepMax,
|
||||
options.Alias("d"),
|
||||
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
||||
|
||||
options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
|
||||
options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))
|
||||
|
||||
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
||||
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
|
||||
"The graph files follow the graphml format."),
|
||||
)
|
||||
|
||||
options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
|
||||
options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
|
||||
"The ratio file follows the csv format."),
|
||||
)
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.ArgName("SIZE"),
|
||||
options.Description("The size of the kmer used to build the consensus. "+
|
||||
"Default value = -1, which means that the kmer size is estimated from the data"),
|
||||
)
|
||||
|
||||
options.Float64Var(&_threshold, "threshold", _threshold,
|
||||
options.ArgName("RATIO"),
|
||||
options.Description("A threshold between O and 1 used to determine the optimal "+
|
||||
"kmer size"),
|
||||
)
|
||||
|
||||
options.Float64Var(&_mindepth, "min-depth", _mindepth,
|
||||
options.ArgName("DEPTH"),
|
||||
options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
|
||||
"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
|
||||
"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
|
||||
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
||||
)
|
||||
|
||||
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Description("Maximum length of the consensus sequence. "+
|
||||
"Default value = -1, which means that no limit is applied"),
|
||||
)
|
||||
|
||||
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
|
||||
options.Description("If set, sequences occurring a single time in the data set are discarded."))
|
||||
|
||||
|
||||
}
|
||||
|
||||
// OptionSet sets up the complete command-line option set of obiminion.
|
||||
//
|
||||
// It registers on the given getoptions.GetOpt the obiconvert input options,
|
||||
// the obiconvert output options and the obiminion options. It returns nothing.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.InputOptionSet(options)
|
||||
obiconvert.OutputOptionSet(options)
|
||||
ObiminionOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIDistStepMax returns the maximum number of differences between two variant sequences.
|
||||
//
|
||||
// The value is bound to the `--distance` option (alias `-d`); its initial value is 1.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer.
|
||||
func CLIDistStepMax() int {
|
||||
return _distStepMax
|
||||
}
|
||||
|
||||
// CLISampleAttribute returns the name of the attribute used to store the sample name.
|
||||
//
|
||||
// The value is bound to the `--sample` option (alias `-s`); its initial value is "sample".
|
||||
//
|
||||
// No parameters.
|
||||
// Returns a string.
|
||||
func CLISampleAttribute() string {
|
||||
return _sampleAttribute
|
||||
}
|
||||
|
||||
// CLIMinCountToEvalMutationRate returns the value of the `--min-eval-rate` option: the minimum
|
||||
// abundance of a sequence to be used to evaluate the mutation rate (initial value 1000).
|
||||
func CLIMinCountToEvalMutationRate() int {
|
||||
return _minEvalRate
|
||||
}
|
||||
|
||||
// ClusterMode returns the package-level _clusterMode flag (initial value false).
// NOTE(review): no option registered by ObiminionOptionSet binds this variable —
// confirm whether it is set anywhere or is a leftover.
func ClusterMode() bool {
|
||||
return _clusterMode
|
||||
}
|
||||
|
||||
// OnlyHead returns the package-level _onlyHead flag (initial value false).
// NOTE(review): no option registered by ObiminionOptionSet binds this variable —
// confirm whether it is set anywhere or is a leftover.
|
||||
func OnlyHead() bool {
|
||||
return _onlyHead
|
||||
}
|
||||
|
||||
// CLISaveGraphToFiles returns true if the graphs must be saved, i.e. if the
// `--save-graph` option holds something other than its "__@@NOSAVE@@__" placeholder.
|
||||
func CLISaveGraphToFiles() bool {
|
||||
return _saveGraph != "__@@NOSAVE@@__"
|
||||
}
|
||||
|
||||
// CLIGraphFilesDirectory returns the value of the `--save-graph` option, used as
// the directory where the graph files are saved. Callers should check
// CLISaveGraphToFiles first: the raw value is a placeholder when the option is unset.
|
||||
func CLIGraphFilesDirectory() string {
|
||||
return _saveGraph
|
||||
}
|
||||
|
||||
// IsSaveRatioTable returns true if the table of ratios must be saved, i.e. if the
// `--save-ratio` option holds something other than its "__@@NOSAVE@@__" placeholder.
|
||||
func IsSaveRatioTable() bool {
|
||||
return _saveRatio != "__@@NOSAVE@@__"
|
||||
}
|
||||
|
||||
// RatioTableFilename returns the value of the `--save-ratio` option, used as the
// name of the file storing the ratio table. Callers should check IsSaveRatioTable
// first: the raw value is a placeholder when the option is unset.
|
||||
func RatioTableFilename() string {
|
||||
return _saveRatio
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the value of the kmer size to use for building the consensus.
|
||||
//
|
||||
// The value of the kmer size is set by the user with the `--kmer-size` option.
|
||||
// The value -1 (the initial value) means that the kmer size is estimated from the
|
||||
// data (presumably as the smallest size for which no kmer occurs twice in a sequence — confirm in obiconsensus).
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer value.
|
||||
func CLIKmerSize() int {
|
||||
return _kmerSize
|
||||
}
|
||||
|
||||
// CLIKmerDepth returns the value of the `--min-depth` option. The initial value
// is -1.0, which the option help describes as "estimated from the data".
func CLIKmerDepth() float64 {
|
||||
return _mindepth
|
||||
}
|
||||
|
||||
// CLIThreshold returns the value of the `--threshold` option, a ratio used to
// determine the optimal kmer size. Its initial value in this package is 1.0.
func CLIThreshold() float64 {
|
||||
return _threshold
|
||||
}
|
||||
|
||||
// CLIMaxConsensusLength returns the value of the `--consensus-max-length` option,
// the maximum length of a consensus sequence. Its initial value in this package
// is 1000.
func CLIMaxConsensusLength() int {
|
||||
return _consensus_max_length
|
||||
}
|
||||
|
||||
// CLINoSingleton reports whether singleton sequences should be discarded (`--no-singleton` option).
|
||||
//
|
||||
// No parameters.
|
||||
// Returns true when sequences occurring a single time in the data set must be discarded.
|
||||
func CLINoSingleton() bool {
|
||||
return _NoSingleton
|
||||
}
|
||||
@@ -47,8 +47,8 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) {
|
||||
frags := obiiter.IFragments(
|
||||
CLIMaxLength()*1000,
|
||||
CLIMaxLength()*100,
|
||||
CLIMaxLength()+obiutils.MaxInt(len(CLIForwardPrimer()),
|
||||
len(CLIReversePrimer()))+obiutils.MinInt(len(CLIForwardPrimer()),
|
||||
CLIMaxLength()+obiutils.Max(len(CLIForwardPrimer()),
|
||||
len(CLIReversePrimer()))+obiutils.Min(len(CLIForwardPrimer()),
|
||||
len(CLIReversePrimer()))/2,
|
||||
100,
|
||||
obioptions.CLIParallelWorkers(),
|
||||
|
||||
@@ -63,7 +63,7 @@ func IndexSequence(seqidx int,
|
||||
if lca[order] == ancestor {
|
||||
// nseq[i]++
|
||||
if mini != -1 {
|
||||
wordmin = obiutils.MaxInt(sequence.Len(), references[order].Len()) - 3 - 4*mini
|
||||
wordmin = obiutils.Max(sequence.Len(), references[order].Len()) - 3 - 4*mini
|
||||
}
|
||||
|
||||
if cw[order] < wordmin {
|
||||
@@ -189,7 +189,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
indexed := obiiter.MakeIBioSequence()
|
||||
go func() {
|
||||
for i := 0; i < len(references); i += 10 {
|
||||
limits <- [2]int{i, obiutils.MinInt(i+10, len(references))}
|
||||
limits <- [2]int{i, obiutils.Min(i+10, len(references))}
|
||||
}
|
||||
close(limits)
|
||||
}()
|
||||
|
||||
@@ -110,7 +110,7 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
d, _, _, _ := obialign.D1Or0(sequence, references[order])
|
||||
if d >= 0 {
|
||||
score = d
|
||||
alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
|
||||
alilength = obiutils.Max(sequence.Len(), ref.Len())
|
||||
lcs = alilength - score
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -12,6 +12,11 @@ var _chunks = 100
|
||||
var _NAValue = "NA"
|
||||
var _NoSingleton = false
|
||||
|
||||
// UniqueOptionSet sets up unique options for the obiuniq command.
|
||||
//
|
||||
// It configures various options such as merging attributes, category attributes,
|
||||
// defining the NA value, handling singleton sequences, choosing between in-memory
|
||||
// or disk storage, and specifying the chunk count for dataset division.
|
||||
func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringSliceVar(&_StatsOn, "merge",
|
||||
1, 1,
|
||||
@@ -40,25 +45,67 @@ func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obiuniq command
|
||||
//
|
||||
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
UniqueOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIStatsOn returns the list of variables on which statistics are computed.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a slice of strings representing the statistics on values.
|
||||
func CLIStatsOn() []string {
|
||||
return _StatsOn
|
||||
}
|
||||
|
||||
// SetStatsOn sets the list of variables on which statistics are computed.
|
||||
//
|
||||
// It takes a slice of strings as its parameter and does not return anything.
|
||||
func SetStatsOn(statsOn []string) {
|
||||
_StatsOn = statsOn
|
||||
}
|
||||
|
||||
// AddStatsOn adds variables to the list of variables on which statistics are computed.
|
||||
//
|
||||
// Parameters:
|
||||
// - statsOn: variadic strings representing the statistics to be added.
|
||||
func AddStatsOn(statsOn ...string) {
|
||||
_StatsOn = append(_StatsOn, statsOn...)
|
||||
}
|
||||
|
||||
// CLIKeys returns the keys used to distinguish among identical sequences.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a slice of strings representing the keys used by the CLI.
|
||||
func CLIKeys() []string {
|
||||
return _Keys
|
||||
}
|
||||
|
||||
// CLIUniqueInMemory returns if the unique function is running in memory only.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a boolean value indicating whether the function is running in memory or not.
|
||||
func CLIUniqueInMemory() bool {
|
||||
return _InMemory
|
||||
}
|
||||
|
||||
// SetUniqueInMemory sets whether the unique function is running in memory or not.
|
||||
//
|
||||
// inMemory bool - A boolean value indicating whether the function is running in memory.
|
||||
// No return value.
|
||||
func SetUniqueInMemory(inMemory bool) {
|
||||
_InMemory = inMemory
|
||||
}
|
||||
|
||||
// CLINumberOfChunks returns the number of chunks used for the first bucket sort step used by the unique function.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns an integer representing the number of chunks.
|
||||
func CLINumberOfChunks() int {
|
||||
if _chunks <= 1 {
|
||||
return 1
|
||||
@@ -67,10 +114,40 @@ func CLINumberOfChunks() int {
|
||||
return _chunks
|
||||
}
|
||||
|
||||
// SetNumberOfChunks sets the number of chunks used for the first bucket sort step used by the unique function.
|
||||
//
|
||||
// chunks int - The number of chunks to be set.
|
||||
// No return value.
|
||||
func SetNumberOfChunks(chunks int) {
|
||||
_chunks = chunks
|
||||
}
|
||||
|
||||
// CLINAValue returns the value used as a placeholder for missing values.
|
||||
//
|
||||
// No parameters.
|
||||
// Return type: string.
|
||||
func CLINAValue() string {
|
||||
return _NAValue
|
||||
}
|
||||
|
||||
// SetNAValue sets the NA value to the specified string.
|
||||
//
|
||||
// value string - The value to set as the NA value.
|
||||
func SetNAValue(value string) {
|
||||
_NAValue = value
|
||||
}
|
||||
|
||||
// CLINoSingleton returns a boolean value indicating whether or not singleton sequences should be discarded.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
|
||||
func CLINoSingleton() bool {
|
||||
return _NoSingleton
|
||||
}
|
||||
|
||||
// SetNoSingleton sets the boolean value indicating whether or not singleton sequences should be discarded.
|
||||
//
|
||||
// noSingleton bool - The boolean value to set for _NoSingleton.
|
||||
func SetNoSingleton(noSingleton bool) {
|
||||
_NoSingleton = noSingleton
|
||||
}
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
)
|
||||
|
||||
func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
options := make([]obichunk.WithOption, 0, 30)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user