From 8671285d0264aa8f14afd850bf202762bb9ccf86 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 24 Feb 2025 08:48:31 +0100 Subject: [PATCH] add the --min-sample-count option to obiclean. --- Release-notes.md | 4 ++++ go.mod | 6 ++---- go.sum | 5 ----- pkg/obialign/fastlcs.go | 17 ++++++++--------- pkg/obioptions/version.go | 2 +- pkg/obiseq/predicate.go | 10 ++++++++++ pkg/obitools/obiclean/obiclean.go | 7 ++++++- pkg/obitools/obiclean/options.go | 9 +++++++++ pkg/obitools/obipairing/pairing.go | 6 +++--- 9 files changed, 43 insertions(+), 23 deletions(-) diff --git a/Release-notes.md b/Release-notes.md index 99f480d..28e1745 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -23,6 +23,10 @@ ### New features +- `obiclean` add a new **--min-sample-count** option with a default value of 1, + asking to filter out sequences which are not occurring in at least the + specified number of samples. + - `obitoaxonomy` a new **--dump|D** option allows for dumping a sub-taxonomy. - Taxonomy dump can now be provided as a four-columns CSV file to the **--taxonomy** diff --git a/go.mod b/go.mod index afcb2e0..650e73c 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,9 @@ go 1.23.1 require ( github.com/DavidGamba/go-getoptions v0.28.0 github.com/PaesslerAG/gval v1.2.2 + github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9 github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df + github.com/buger/jsonparser v1.1.1 github.com/chen3feng/stl4go v0.1.1 github.com/dlclark/regexp2 v1.11.4 github.com/goccy/go-json v0.10.3 @@ -24,10 +26,6 @@ require ( ) require ( - github.com/Clever/csvlint v0.3.0 // indirect - github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9 // indirect - github.com/buger/jsonparser v1.1.1 // indirect - github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/kr/pretty v0.3.0 // indirect diff --git a/go.sum b/go.sum index 9bac5f4..7ba5245 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/Clever/csvlint v0.3.0 h1:58WEFXWy+i0fCbxTXscR2QwYESRuAUFjEGLgZs6j2iU= -github.com/Clever/csvlint v0.3.0/go.mod h1:+wLRuW/bI8NhpRoeyUBxqKsK35OhvgJhXHSWdKp5XJU= github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c= github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84= github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E= @@ -14,8 +12,6 @@ github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMU github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q= github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -73,7 +69,6 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= diff --git a/pkg/obialign/fastlcs.go b/pkg/obialign/fastlcs.go index 075d4a0..6ca7967 100644 --- a/pkg/obialign/fastlcs.go +++ b/pkg/obialign/fastlcs.go @@ -21,15 +21,15 @@ func encodeValues(score, length int, out bool) uint64 { return fo } -func _isout(value uint64) bool { - const outmask = uint64(1) << dwsize - return (value & outmask) == 0 -} +// func _isout(value uint64) bool { +// const outmask = uint64(1) << dwsize +// return (value & outmask) == 0 +// } -func _lpath(value uint64) int { - const mask = uint64(1<= n + } + + return f +} + func IsSequenceMatch(pattern string) SequencePredicate { pat, err := regexp.Compile("(?i)" + pattern) diff --git a/pkg/obitools/obiclean/obiclean.go b/pkg/obitools/obiclean/obiclean.go index aa9641e..3a4fada 100644 --- a/pkg/obitools/obiclean/obiclean.go +++ b/pkg/obitools/obiclean/obiclean.go @@ -368,7 +368,12 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence { iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA") if OnlyHead() { - iter = iter.FilterOn(IsHead, 1000) + iter = iter.FilterOn(IsHead, obidefault.BatchSize()) + } + + if MinSampleCount() > 1 { + sc := obiseq.OccurInAtleast(SampleAttribute(), MinSampleCount()) + iter = iter.FilterOn(sc, obidefault.BatchSize()) } return iter diff --git a/pkg/obitools/obiclean/options.go b/pkg/obitools/obiclean/options.go index 1bf5952..d87aa6c 100644 --- a/pkg/obitools/obiclean/options.go +++ b/pkg/obitools/obiclean/options.go @@ -16,6 +16,7 @@ var _onlyHead = false var _saveGraph = "__@@NOSAVE@@__" var _saveRatio = "__@@NOSAVE@@__" +var _minSample = 1 func ObicleanOptionSet(options *getoptions.GetOpt) { options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, @@ -55,6 +56,9 @@ func ObicleanOptionSet(options *getoptions.GetOpt) { "The ratio file follows the csv format."), ) + options.IntVar(&_minSample, "min-sample-count", _minSample, + options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."), + ) } func OptionSet(options *getoptions.GetOpt) { @@ -111,3 +115,8 @@ func IsSaveRatioTable() bool { func RatioTableFilename() string { return _saveRatio } + +// It returns the minimum number of samples a sequence must be present in to be considered in the analysis +func MinSampleCount() int { + return _minSample +} diff --git a/pkg/obitools/obipairing/pairing.go b/pkg/obitools/obipairing/pairing.go index 45fc809..991a635 100644 --- a/pkg/obitools/obipairing/pairing.go +++ b/pkg/obitools/obipairing/pairing.go @@ -238,7 +238,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence, log.Printf("End of the sequence Pairing") }() - f := func(iterator obiiter.IBioSequence, wid int) { + f := func(iterator obiiter.IBioSequence) { arena := obialign.MakePEAlignArena(150, 150) shifts := make(map[int]int) @@ -263,9 +263,9 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence, log.Printf("Start of the sequence Pairing using %d workers\n", nworkers) for i := 0; i < nworkers-1; i++ { - go f(iterator.Split(), i) + go f(iterator.Split()) } - go f(iterator, nworkers-1) + go f(iterator) return newIter }