Add obiminion first version

Former-commit-id: aa5ace7bd4d2266333715fca7094d1c3cbbb5e6d
This commit is contained in:
Eric Coissac
2024-05-14 08:16:12 +02:00
parent 9e63013bc2
commit 017030bcce
24 changed files with 1599 additions and 469 deletions

View File

@@ -4,92 +4,93 @@ import (
"fmt"
"os"
"path"
"sort"
"slices"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
func BuildConsensus(seqs obiseq.BioSequenceSlice,
consensus_id string,
kmer_size int, quorum float64,
min_depth float64,
max_length int,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
if save_graph {
if dirname == "" {
dirname = "."
}
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
// path does not exist or is not directory
os.RemoveAll(dirname)
err := os.Mkdir(dirname, 0755)
if err != nil {
log.Panicf("Cannot create directory %s for saving graphs", dirname)
}
}
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id)))
if err == nil {
defer fasta.Close()
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false))
fasta.Close()
}
}
log.Printf("Number of reads : %d\n", len(seqs))
if kmer_size < 0 {
longest := make([]int, len(seqs))
for i := range seqs {
s := seqs[i : i+1]
for i, seq := range seqs {
s := obiseq.BioSequenceSlice{seq}
sa := obisuffix.BuildSuffixArray(&s)
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
longest[i] = slices.Max(sa.CommonSuffix())
}
o := obiutils.Order(sort.IntSlice(longest))
i := int(float64(len(seqs)) * quorum)
// o := obiutils.Order(sort.IntSlice(longest))
// i := int(float64(len(seqs)) * quorum)
kmer_size = longest[o[i]] + 1
// if i >= len(o) {
// i = len(o) - 1
// }
kmer_size = slices.Max(longest) + 1
// kmer_size = longest[o[i]] + 1
log.Printf("estimated kmer size : %d", kmer_size)
}
graph := obikmer.MakeDeBruijnGraph(kmer_size)
var graph *obikmer.DeBruijnGraph
for {
graph = obikmer.MakeDeBruijnGraph(kmer_size)
for _, s := range seqs {
graph.Push(s)
for _, s := range seqs {
graph.Push(s)
}
if !graph.HasCycle() {
break
}
kmer_size++
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
}
log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
threshold := 0
switch {
case min_depth < 0:
spectrum := graph.LinkSpectrum()
cum := make(map[int]int)
spectrum[1] = 0
for i := 2; i < len(spectrum); i++ {
spectrum[i] += spectrum[i-1]
cum[spectrum[i]]++
}
max := 0
kmax := 0
for k, obs := range cum {
if obs > max {
max = obs
kmax = k
}
}
for i, total := range spectrum {
if total == kmax {
threshold = i
break
}
}
threshold /= 2
case min_depth >= 1:
threshold = int(min_depth)
default:
threshold = int(float64(len(seqs)) * min_depth)
}
graph.FilterMin(threshold)
log.Printf("Graph size : %d\n", graph.Len())
if save_graph {
file, err := os.Create(path.Join(dirname,
fmt.Sprintf("%s.gml", seqs[0].Source())))
fmt.Sprintf("%s_raw_consensus.gml", consensus_id)))
if err != nil {
fmt.Println(err)
@@ -99,29 +100,133 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
}
id := seqs[0].Source()
if id == "" {
id = seqs[0].Id()
}
seq, err := graph.LongestConsensus(id)
log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
// threshold := 0
// switch {
// case min_depth < 0:
// spectrum := graph.WeightSpectrum()
// cum := make(map[int]int)
// spectrum[1] = 0
// for i := 2; i < len(spectrum); i++ {
// spectrum[i] += spectrum[i-1]
// cum[spectrum[i]]++
// }
// max := 0
// kmax := 0
// for k, obs := range cum {
// if obs > max {
// max = obs
// kmax = k
// }
// }
// for i, total := range spectrum {
// if total == kmax {
// threshold = i
// break
// }
// }
// threshold /= 2
// if threshold < 1 {
// threshold = 1
// }
// log.Info("Estimated kmer_min_occur = ", threshold)
// case min_depth >= 1:
// threshold = int(min_depth)
// default:
// threshold = int(float64(len(seqs)) * min_depth)
// }
// graph.FilterMinWeight(threshold)
// log.Printf("Graph size : %d\n", graph.Len())
// if save_graph {
// file, err := os.Create(path.Join(dirname,
// fmt.Sprintf("%s_consensus.gml", consensus_id)))
// if err != nil {
// fmt.Println(err)
// } else {
// file.WriteString(graph.Gml())
// file.Close()
// }
// }
seq, err := graph.LongestConsensus(consensus_id, max_length)
sumCount := 0
for _, s := range seqs {
sumCount += s.Count()
if seq != nil {
for _, s := range seqs {
sumCount += s.Count()
}
seq.SetCount(sumCount)
seq.SetAttribute("seq_length", seq.Len())
seq.SetAttribute("kmer_size", kmer_size)
//seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
seq.SetAttribute("filtered_graph_size", graph.Len())
seq.SetAttribute("full_graph_size", total_kmer)
}
seq.SetCount(sumCount)
seq.SetAttribute("seq_length", seq.Len())
seq.SetAttribute("kmer_size", kmer_size)
seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
seq.SetAttribute("filtered_graph_size", graph.Len())
seq.SetAttribute("full_graph_size", total_kmer)
return seq, err
}
// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice,
// kmer_size int, quorum float64,
// min_depth float64,
// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) {
// ctx, cancel := context.WithTimeout(context.Background(), timeout)
// defer cancel()
// consensus := func() *obiseq.BioSequence {
// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,)
// if err != nil {
// cons = nil
// }
// return cons
// }
// computation := func() <-chan *obiseq.BioSequence {
// result := make(chan *obiseq.BioSequence)
// go func() {
// select {
// case <-ctx.Done():
// result <- nil
// default:
// result <- consensus()
// }
// }()
// return result
// }
// calcResult := computation()
// select {
// case result := <-calcResult:
// if result == nil {
// return nil, fmt.Errorf("cannot compute consensus")
// }
// return result, nil
// case <-ctx.Done():
// return nil, fmt.Errorf("compute consensus timeout, exiting")
// }
// }
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
newIter := obiiter.MakeIBioSequence()
size := 10
@@ -153,10 +258,19 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for iterator.Next() {
seqs := iterator.Get()
consensus, err := BuildConsensus(seqs.Slice(),
sequences := seqs.Slice()
id := sequences[0].Source()
if id == "" {
id = sequences[0].Id()
}
consensus, err := BuildConsensus(sequences,
id,
CLIKmerSize(), CLIThreshold(),
CLIKmerDepth(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
CLIMaxConsensusLength(),
CLISaveGraphToFiles(),
CLIGraphFilesDirectory(),
)
if err == nil {

View File

@@ -9,6 +9,7 @@ var _saveGraph = "__@@NOSAVE@@__"
var _kmerSize = -1
var _threshold = 0.99
var _mindepth = -1.0
var _consensus_max_length = -1
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
@@ -38,6 +39,12 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) {
"Default value = -1, which means that the DEPTH is estimated from the data"),
)
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
options.ArgName("LENGTH"),
options.Description("Maximum length of the consensus sequence. "+
"Default value = -1, which means that no limit is applied"),
)
}
func OptionSet(options *getoptions.GetOpt) {
@@ -67,3 +74,7 @@ func CLIKmerDepth() float64 {
// CLIThreshold returns the current value of the package-level _threshold
// setting.
func CLIThreshold() float64 { return _threshold }
// CLIMaxConsensusLength returns the maximum length allowed for a consensus
// sequence.
//
// The value is bound to the `--consensus-max-length` option.
func CLIMaxConsensusLength() int { return _consensus_max_length }

View File

@@ -0,0 +1,290 @@
package obiminion
import (
"fmt"
"os"
"sync"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obigraph"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiannotate"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiuniq"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
)
// SampleWeight builds an accessor giving the weight of each sequence in one sample.
//
// Parameters:
//   - seqs: pointer to the slice of sequences the accessor indexes into
//   - sample: name of the sample whose value is looked up
//   - sample_key: key passed to StatsOn to obtain the per-sample statistics
//
// Returns a function mapping a sequence index to the value recorded for
// sample in the statistics of that sequence, converted to float64, or 0 when
// sample is absent from those statistics.
func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func(int) float64 {
	return func(idx int) float64 {
		count, found := (*seqs)[idx].StatsOn(sample_key, "NA")[sample]
		if !found {
			return 0
		}
		return float64(count)
	}
}
// SeqBySamples groups sequences by the samples they belong to.
//
// A sequence having statistics on sample_key is added to the group of every
// sample listed in those statistics. Otherwise, if it carries sample_key as a
// string attribute, it is added to the group named by that attribute.
// Sequences matching neither case are left out.
//
// Parameters:
//   - seqs: the sequences to dispatch
//   - sample_key: the key identifying the sample information
//
// Returns a map indexed by sample name; each value points to the slice of
// sequences (shared pointers, not copies) observed in that sample.
func SeqBySamples(seqs obiseq.BioSequenceSlice, sample_key string) map[string]*obiseq.BioSequenceSlice {
	bySample := make(map[string]*obiseq.BioSequenceSlice)

	// register appends seq to the group of sample, creating the group on first use.
	register := func(sample string, seq *obiseq.BioSequence) {
		group, known := bySample[sample]
		if !known {
			bySample[sample] = &obiseq.BioSequenceSlice{seq}
			return
		}
		*group = append(*group, seq)
	}

	for _, seq := range seqs {
		if !seq.HasStatsOn(sample_key) {
			if sample, ok := seq.GetStringAttribute(sample_key); ok {
				register(sample, seq)
			}
			continue
		}

		for sample := range seq.StatsOn(sample_key, "NA") {
			register(sample, seq)
		}
	}

	return bySample
}
// Mutation is the data attached to an edge of the graph built by
// BuildDiffSeqGraph.
type Mutation struct {
	// Position, SeqA and SeqB receive the second, third and fourth values
	// returned by obialign.D1Or0 for the two linked sequences.
	Position   int
	SeqA, SeqB byte
	// Ratio is the weight of the first sequence divided by the weight of the
	// second one.
	Ratio float64
}
// BuildDiffSeqGraph builds the graph linking the sequences of one sample that
// differ only slightly from each other.
//
// Parameters:
//   - name: name of the graph; also the sample used to weight the vertices
//   - name_key: key handed to SampleWeight to read the per-sample statistics
//   - seqs: the sequences of the sample, used as the graph vertices
//   - distmax: when greater than 1, pairs rejected by obialign.D1Or0 are
//     tested again with obialign.FastLCSScore and linked when their distance
//     (alignment length minus LCS score) does not exceed distmax
//   - nworkers: number of goroutines comparing pairs of sequences; values
//     below 1 are treated as 1
//
// Every pair (i, j) with i < j is examined. The function returns the
// underlying graph once all the workers have finished.
func BuildDiffSeqGraph(name, name_key string,
	seqs *obiseq.BioSequenceSlice,
	distmax, nworkers int) *obigraph.Graph[*obiseq.BioSequence, Mutation] {

	// Without any worker nobody receives from iseq and the first send below
	// would block forever; a negative count would make WaitGroup.Add panic.
	if nworkers < 1 {
		nworkers = 1
	}

	graph := obigraph.NewGraphBuffer[*obiseq.BioSequence, Mutation](name, (*[]*obiseq.BioSequence)(seqs))

	// iseq distributes the index of the first sequence of the pairs to the workers.
	iseq := make(chan int)
	defer graph.Close()

	ls := len(*seqs)

	sw := SampleWeight(seqs, name, name_key)
	graph.Graph.VertexWeight = sw

	waiting := sync.WaitGroup{}
	waiting.Add(nworkers)

	bar := (*progressbar.ProgressBar)(nil)
	if obiconvert.CLIProgressBar() {

		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription(fmt.Sprintf("[Build graph] on %s", name)),
		)

		bar = progressbar.NewOptions(len(*seqs), pbopt...)
	}

	// computeEdges compares each sequence index received from iseq with all
	// the sequences of higher index and records the accepted pairs as edges.
	computeEdges := func() {
		defer waiting.Done()
		for i := range iseq {
			s1 := (*seqs)[i]
			for j := i + 1; j < ls; j++ {
				s2 := (*seqs)[j]
				ratio := sw(i) / sw(j)
				ok, pos, a1, a2 := obialign.D1Or0(s1, s2)
				if ok >= 0 {
					graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
				} else if distmax > 1 {
					lcs, lali := obialign.FastLCSScore(s1, s2, distmax, nil)
					dist := lali - lcs
					if lcs > 0 && dist <= distmax {
						// log.Infof("Seq %s and %s: LCSScore: %d, dist: %d\n", s1.Id(), s2.Id(), lcs, dist)
						// NOTE(review): pos, a1 and a2 come from the D1Or0
						// call that rejected the pair; confirm they are
						// meaningful for edges accepted on the LCS distance.
						graph.AddEdge(i, j, &Mutation{pos, a1, a2, ratio})
					}
				}
			}

			if bar != nil {
				bar.Add(1)
			}
		}
	}

	for i := 0; i < nworkers; i++ {
		go computeEdges()
	}

	for i := 0; i < ls; i++ {
		iseq <- i
	}
	close(iseq)

	waiting.Wait()
	return graph.Graph
}
// MinionDenoise produces one denoised sequence per vertex of graph.
//
// For a vertex with more than 4 neighbours, a consensus is built with
// obiconsensus.BuildConsensus from the vertex and its neighbours. For the
// other vertices, or when no consensus can be built, the output is a fresh
// sequence carrying the id, sequence and definition of the vertex. The input
// vertices are never modified: the same sequence object can be shared by the
// graphs of several samples.
//
// Every output sequence gets:
//   - the "obiminion_consensus" attribute, true only for a consensus,
//   - a count set to the weight of the vertex in the graph,
//   - the sample_key attribute set to the name of the graph.
//
// Parameters:
//   - graph: the sequence graph of one sample
//   - sample_key: name of the attribute receiving the graph name
//   - kmer_size, threshold, depth, max_length: forwarded to
//     obiconsensus.BuildConsensus as its kmer_size, quorum, min_depth and
//     max_length arguments
//
// Returns a slice with one sequence per vertex, in vertex order.
func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
	sample_key string, kmer_size int, max_length int, threshold float64, depth float64) obiseq.BioSequenceSlice {
	denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))

	for i, v := range *graph.Vertices {
		var clean *obiseq.BioSequence

		degree := graph.Degree(i)
		if degree > 4 {
			// The consensus is computed on the neighbours plus the vertex itself.
			pack := obiseq.MakeBioSequenceSlice(degree + 1)
			for k, j := range graph.Neighbors(i) {
				pack[k] = (*graph.Vertices)[j]
			}
			pack[degree] = v

			consensus, err := obiconsensus.BuildConsensus(pack,
				fmt.Sprintf("%s_consensus", v.Id()),
				kmer_size,
				threshold,
				depth, max_length,
				CLISaveGraphToFiles(), CLIGraphFilesDirectory())

			switch {
			case err != nil:
				log.Warning(err)
			case consensus == nil:
				log.Warningf("No consensus sequence built for %s", v.Id())
			default:
				clean = consensus
			}

			pack.Recycle(false)
		}

		isConsensus := clean != nil
		if !isConsensus {
			// Work on a copy so that the count and sample attribute set below
			// never alter the input sequence.
			clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition())
		}
		clean.SetAttribute("obiminion_consensus", isConsensus)

		clean.SetCount(int(graph.VertexWeight(i)))
		clean.SetAttribute(sample_key, graph.Name)

		denoised[i] = clean
	}

	return denoised
}
// CLIOBIMinion runs the obiminion denoising procedure on a sequence dataset.
//
// The input iterator is fully loaded and its sequences are grouped by sample
// with SeqBySamples. For each sample a graph is built with BuildDiffSeqGraph
// and denoised with MinionDenoise; the denoised sequences of a sample are
// pushed as one batch on a new iterator. That iterator is finally passed
// through obiuniq.CLIUnique and obiannotate.AddSeqLengthWorker.
//
// When CLISaveGraphToFiles is true, the directory returned by
// CLIGraphFilesDirectory is created if needed (the function panics if it
// cannot be created) and one GML file per sample is written into it.
//
// Side effects: the obiuniq package settings are modified (the sample
// attribute is added to the statistics list, the in-memory mode is disabled
// and the singleton filter follows CLINoSingleton).
func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
	dirname := CLIGraphFilesDirectory()
	newIter := obiiter.MakeIBioSequence()

	db := itertator.Load()

	log.Infof("Sequence dataset of %d sequences loaded\n", len(db))

	samples := SeqBySamples(db, CLISampleAttribute())
	db.Recycle(false)

	log.Infof("Dataset composed of %d samples\n", len(samples))
	if CLIMaxConsensusLength() > 0 {
		log.Infof("Maximum consensus length: %d\n", CLIMaxConsensusLength())
	}

	if CLISaveGraphToFiles() {
		if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
			// path does not exist or is not directory
			os.RemoveAll(dirname)
			if err := os.Mkdir(dirname, 0755); err != nil {
				log.Panicf("Cannot create directory %s for saving graphs: %v", dirname, err)
			}
		}
	}

	bar := (*progressbar.ProgressBar)(nil)
	if obiconvert.CLIProgressBar() {

		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Filter graph on abundance ratio]"),
		)

		bar = progressbar.NewOptions(len(samples), pbopt...)
	}

	newIter.Add(1)
	go func() {
		// One batch is produced per sample, numbered in processing order.
		sample_order := 0
		for sample, seqs := range samples {
			graph := BuildDiffSeqGraph(sample,
				CLISampleAttribute(),
				seqs,
				CLIDistStepMax(),
				obioptions.CLIParallelWorkers())

			if bar != nil {
				bar.Add(1)
			}

			if CLISaveGraphToFiles() {
				graph.WriteGmlFile(fmt.Sprintf("%s/%s.gml",
					CLIGraphFilesDirectory(),
					sample),
					false, 1, 0, 3)
			}

			denoised := MinionDenoise(graph,
				CLISampleAttribute(),
				CLIKmerSize(),
				CLIMaxConsensusLength(),
				CLIThreshold(),
				CLIKmerDepth())

			newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))

			sample_order++
		}

		newIter.Done()
	}()

	go func() {
		newIter.WaitAndClose()
	}()

	obiuniq.AddStatsOn(CLISampleAttribute())
	obiuniq.SetUniqueInMemory(false)
	obiuniq.SetNoSingleton(CLINoSingleton())
	return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
}

View File

@@ -0,0 +1,179 @@
package obiminion
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Package-level storage of the obiminion settings. The values below are the
// defaults used when the corresponding option is not given.
var (
	// Bound to the --distance and --sample options.
	_distStepMax     = 1
	_sampleAttribute = "sample"

	_ratioMax = 1.0

	// Bound to the --min-eval-rate option.
	_minEvalRate = 1000

	_clusterMode = false
	_onlyHead    = false

	// Bound to the --kmer-size, --threshold, --min-depth and
	// --consensus-max-length options.
	_kmerSize             = -1
	_threshold            = 1.0
	_mindepth             = -1.0
	_consensus_max_length = 1000

	// Bound to the --no-singleton option.
	_NoSingleton = false

	// Bound to the --save-graph and --save-ratio options; the sentinel value
	// means that nothing has to be saved.
	_saveGraph = "__@@NOSAVE@@__"
	_saveRatio = "__@@NOSAVE@@__"
)
// ObiminionOptionSet declares the command line options specific to obiminion.
//
// Each option is bound to a package-level variable whose current value is
// used as the default.
//
// The help texts are plain strings: getoptions.Description does not format
// its argument, so they must not contain printf verbs such as %s or %d.
//
// options: the option parser on which the options are declared.
func ObiminionOptionSet(options *getoptions.GetOpt) {
	options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
		options.Alias("s"),
		options.Description("Attribute containing sample descriptions."))

	options.IntVar(&_distStepMax, "distance", _distStepMax,
		options.Alias("d"),
		options.Description("Maximum numbers of differences between two variant sequences."))

	options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
		options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))

	options.StringVar(&_saveGraph, "save-graph", _saveGraph,
		options.Description("Creates a directory containing the set of graphs built by obiminion. "+
			"The graph files follow the gml format."),
	)

	options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
		options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
			"The ratio file follows the csv format."),
	)

	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
		options.ArgName("SIZE"),
		options.Description("The size of the kmer used to build the consensus. "+
			"Default value = -1, which means that the kmer size is estimated from the data"),
	)

	options.Float64Var(&_threshold, "threshold", _threshold,
		options.ArgName("RATIO"),
		options.Description("A threshold between 0 and 1 used to determine the optimal "+
			"kmer size"),
	)

	options.Float64Var(&_mindepth, "min-depth", _mindepth,
		options.ArgName("DEPTH"),
		options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
			"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
			"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
			"Default value = -1, which means that the DEPTH is estimated from the data"),
	)

	// The default declared in this package is 1000, not -1.
	options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
		options.ArgName("LENGTH"),
		options.Description("Maximum length of the consensus sequence. "+
			"Default value = 1000. A value of -1 means that no limit is applied"),
	)

	options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
		options.Description("If set, sequences occurring a single time in the data set are discarded."))
}
// OptionSet declares every command line option of the obiminion command.
//
// It registers, in this order, the input options and the output options of
// obiconvert, then the obiminion specific options (ObiminionOptionSet).
//
// options: the option parser on which the options are declared.
// It does not return any value.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
ObiminionOptionSet(options)
}
// CLIDistStepMax reports the maximum number of differences allowed between
// two variant sequences.
//
// The value is bound to the `--distance` (`-d`) option and defaults to 1.
func CLIDistStepMax() int { return _distStepMax }
// CLISampleAttribute reports the name of the attribute containing the sample
// description of a sequence.
//
// The value is bound to the `--sample` (`-s`) option and defaults to "sample".
func CLISampleAttribute() string { return _sampleAttribute }
// CLIMinCountToEvalMutationRate reports the minimum abundance a sequence must
// have to be used to evaluate the mutation rate.
//
// The value is bound to the `--min-eval-rate` option and defaults to 1000.
func CLIMinCountToEvalMutationRate() int { return _minEvalRate }
// ClusterMode reports the package-level cluster mode flag, false by default.
//
// No option declared by ObiminionOptionSet modifies this flag.
func ClusterMode() bool { return _clusterMode }
// OnlyHead reports the package-level only-head flag, false by default.
//
// No option declared by ObiminionOptionSet modifies this flag.
func OnlyHead() bool { return _onlyHead }
// CLISaveGraphToFiles reports whether the graphs must be saved to files, that
// is whether the `--save-graph` setting differs from its no-save sentinel.
func CLISaveGraphToFiles() bool {
	const noSave = "__@@NOSAVE@@__"
	return _saveGraph != noSave
}
// CLIGraphFilesDirectory returns the directory where the graph files are
// saved, as given to the `--save-graph` option.
//
// When the option is not set the no-save sentinel is returned, so
// CLISaveGraphToFiles should be checked first.
func CLIGraphFilesDirectory() string { return _saveGraph }
// IsSaveRatioTable reports whether the table of ratios must be saved, that is
// whether the `--save-ratio` setting differs from its no-save sentinel.
func IsSaveRatioTable() bool {
	const noSave = "__@@NOSAVE@@__"
	return _saveRatio != noSave
}
// RatioTableFilename returns the name of the file storing the ratio table, as
// given to the `--save-ratio` option.
//
// When the option is not set the no-save sentinel is returned, so
// IsSaveRatioTable should be checked first.
func RatioTableFilename() string { return _saveRatio }
// CLIKmerSize reports the size of the kmers used to build the consensus.
//
// The value is bound to the `--kmer-size` option. The default, -1, means
// that the kmer size is estimated from the data.
func CLIKmerSize() int { return _kmerSize }
// CLIKmerDepth reports the minimum depth required for a kmer to be kept.
//
// The value is bound to the `--min-depth` option. The default, -1, means
// that the depth is estimated from the data.
func CLIKmerDepth() float64 { return _mindepth }
// CLIThreshold reports the threshold used to determine the optimal kmer size.
//
// The value is bound to the `--threshold` option and defaults to 1.0.
func CLIThreshold() float64 { return _threshold }
// CLIMaxConsensusLength reports the maximum length allowed for a consensus
// sequence.
//
// The value is bound to the `--consensus-max-length` option and defaults to
// 1000.
func CLIMaxConsensusLength() int { return _consensus_max_length }
// CLINoSingleton reports whether the sequences occurring a single time in the
// data set must be discarded.
//
// The value is bound to the `--no-singleton` option and defaults to false.
func CLINoSingleton() bool { return _NoSingleton }

View File

@@ -47,8 +47,8 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) {
frags := obiiter.IFragments(
CLIMaxLength()*1000,
CLIMaxLength()*100,
CLIMaxLength()+obiutils.MaxInt(len(CLIForwardPrimer()),
len(CLIReversePrimer()))+obiutils.MinInt(len(CLIForwardPrimer()),
CLIMaxLength()+obiutils.Max(len(CLIForwardPrimer()),
len(CLIReversePrimer()))+obiutils.Min(len(CLIForwardPrimer()),
len(CLIReversePrimer()))/2,
100,
obioptions.CLIParallelWorkers(),

View File

@@ -63,7 +63,7 @@ func IndexSequence(seqidx int,
if lca[order] == ancestor {
// nseq[i]++
if mini != -1 {
wordmin = obiutils.MaxInt(sequence.Len(), references[order].Len()) - 3 - 4*mini
wordmin = obiutils.Max(sequence.Len(), references[order].Len()) - 3 - 4*mini
}
if cw[order] < wordmin {
@@ -189,7 +189,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
indexed := obiiter.MakeIBioSequence()
go func() {
for i := 0; i < len(references); i += 10 {
limits <- [2]int{i, obiutils.MinInt(i+10, len(references))}
limits <- [2]int{i, obiutils.Min(i+10, len(references))}
}
close(limits)
}()

View File

@@ -110,7 +110,7 @@ func FindClosests(sequence *obiseq.BioSequence,
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
score = d
alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
alilength = obiutils.Max(sequence.Len(), ref.Len())
lcs = alilength - score
}
} else {

View File

@@ -12,6 +12,11 @@ var _chunks = 100
var _NAValue = "NA"
var _NoSingleton = false
// UniqueOptionSet sets up unique options for the obiuniq command.
//
// It configures various options such as merging attributes, category attributes,
// defining the NA value, handling singleton sequences, choosing between in-memory
// or disk storage, and specifying the chunk count for dataset division.
func UniqueOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_StatsOn, "merge",
1, 1,
@@ -40,25 +45,67 @@ func UniqueOptionSet(options *getoptions.GetOpt) {
}
// OptionSet adds to the basic option set every option declared for
// the obiuniq command.
//
// It first declares the obiconvert options (obiconvert.OptionSet), then the
// obiuniq specific ones (UniqueOptionSet).
//
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
UniqueOptionSet(options)
}
// CLIStatsOn returns the list of variables on which statistics are computed.
//
// The returned slice is the package-level list itself, not a copy.
func CLIStatsOn() []string { return _StatsOn }
// SetStatsOn replaces the list of variables on which statistics are computed.
//
// statsOn: the new list; it is stored as is, without being copied.
func SetStatsOn(statsOn []string) { _StatsOn = statsOn }
// AddStatsOn appends variables to the list of variables on which statistics
// are computed.
//
// statsOn: the names of the variables to append, in order.
func AddStatsOn(statsOn ...string) { _StatsOn = append(_StatsOn, statsOn...) }
// CLIKeys returns the keys used to distinguish among identical sequences.
//
// The returned slice is the package-level list itself, not a copy.
func CLIKeys() []string { return _Keys }
// CLIUniqueInMemory reports whether the unique function runs in memory only.
func CLIUniqueInMemory() bool { return _InMemory }
// SetUniqueInMemory selects whether the unique function runs in memory only.
//
// inMemory: true to run in memory only.
func SetUniqueInMemory(inMemory bool) { _InMemory = inMemory }
// CLINumberOfChunks returns the number of chunks used for the first bucket sort step used by the unique function.
//
// It does not take any parameters.
// It returns an integer representing the number of chunks.
func CLINumberOfChunks() int {
if _chunks <= 1 {
return 1
@@ -67,10 +114,40 @@ func CLINumberOfChunks() int {
return _chunks
}
// SetNumberOfChunks sets the number of chunks used for the first bucket sort
// step of the unique function.
//
// chunks: the number of chunks; it is stored without any validation.
func SetNumberOfChunks(chunks int) { _chunks = chunks }
// CLINAValue returns the string used as a placeholder for missing values.
func CLINAValue() string { return _NAValue }
// SetNAValue sets the string used as a placeholder for missing values.
//
// value: the new placeholder.
func SetNAValue(value string) { _NAValue = value }
// CLINoSingleton reports whether singleton sequences must be discarded.
func CLINoSingleton() bool { return _NoSingleton }
// SetNoSingleton selects whether singleton sequences must be discarded.
//
// noSingleton: true to discard singleton sequences.
func SetNoSingleton(noSingleton bool) { _NoSingleton = noSingleton }

View File

@@ -8,7 +8,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
)
func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
func CLIUnique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
options := make([]obichunk.WithOption, 0, 30)