add the --min-sample-count option to obiclean.

This commit is contained in:
Eric Coissac
2025-02-24 08:48:31 +01:00
parent 51d11aa36d
commit 8671285d02
9 changed files with 43 additions and 23 deletions

View File

@ -23,6 +23,10 @@
### New features
- `obiclean` adds a new **--min-sample-count** option, with a default value of 1,
which filters out sequences that do not occur in at least the
specified number of samples.
- `obitaxonomy`: a new **--dump|D** option allows dumping a sub-taxonomy.
- A taxonomy dump can now be provided as a four-column CSV file to the **--taxonomy**

6
go.mod
View File

@ -5,7 +5,9 @@ go 1.23.1
require (
github.com/DavidGamba/go-getoptions v0.28.0
github.com/PaesslerAG/gval v1.2.2
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
github.com/buger/jsonparser v1.1.1
github.com/chen3feng/stl4go v0.1.1
github.com/dlclark/regexp2 v1.11.4
github.com/goccy/go-json v0.10.3
@ -24,10 +26,6 @@ require (
)
require (
github.com/Clever/csvlint v0.3.0 // indirect
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
github.com/kr/pretty v0.3.0 // indirect

5
go.sum
View File

@ -1,5 +1,3 @@
github.com/Clever/csvlint v0.3.0 h1:58WEFXWy+i0fCbxTXscR2QwYESRuAUFjEGLgZs6j2iU=
github.com/Clever/csvlint v0.3.0/go.mod h1:+wLRuW/bI8NhpRoeyUBxqKsK35OhvgJhXHSWdKp5XJU=
github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
@ -14,8 +12,6 @@ github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMU
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@ -73,7 +69,6 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=

View File

@ -21,15 +21,15 @@ func encodeValues(score, length int, out bool) uint64 {
return fo
}
// _isout reports whether the bit at position dwsize of value is cleared.
//
// NOTE(review): the name suggests an "out" flag, yet the function returns
// true when that bit is zero — confirm the polarity against encodeValues.
func _isout(value uint64) bool {
	flag := value & (uint64(1) << dwsize)
	return flag == 0
}
// func _isout(value uint64) bool {
// const outmask = uint64(1) << dwsize
// return (value & outmask) == 0
// }
// _lpath returns, as an int, the bitwise complement of value+1 restricted to
// its wsize low-order bits.
func _lpath(value uint64) int {
	const mask = uint64(1<<wsize) - 1
	inverted := ^(value + 1) & mask
	return int(inverted)
}
// func _lpath(value uint64) int {
// const mask = uint64(1<<wsize) - 1
// return int(((value + 1) ^ mask) & mask)
// }
func decodeValues(value uint64) (int, int, bool) {
const mask = uint64(1<<wsize) - 1
@ -57,4 +57,3 @@ func _setout(value uint64) uint64 {
// Precomputed encoded values, built with encodeValues(score, length, out).

// _empty encodes score 0, length 0, with the out flag unset.
var _empty = encodeValues(0, 0, false)
// _out encodes score 0, length 30000, with the out flag set.
var _out = encodeValues(0, 30000, true)
// _notavail encodes score 0, length 30000, with the out flag unset.
var _notavail = encodeValues(0, 30000, false)

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "fb6f857"
var _Commit = "51d11aa"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -196,6 +196,16 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
return f
}
// OccurInAtleast builds a SequencePredicate that accepts a sequence when the
// statistics computed on the attribute named by sample contain at least n
// entries.
//
// The statistics are obtained from BioSequence.StatsOn with a description
// created once by MakeStatsOnDescription(sample); "NA" is passed as the
// second argument (presumably the label used for missing values — confirm).
func OccurInAtleast(sample string, n int) SequencePredicate {
	description := MakeStatsOnDescription(sample)

	return func(sequence *BioSequence) bool {
		return len(sequence.StatsOn(description, "NA")) >= n
	}
}
func IsSequenceMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile("(?i)" + pattern)

View File

@ -368,7 +368,12 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA")
if OnlyHead() {
iter = iter.FilterOn(IsHead, 1000)
iter = iter.FilterOn(IsHead, obidefault.BatchSize())
}
if MinSampleCount() > 1 {
sc := obiseq.OccurInAtleast(SampleAttribute(), MinSampleCount())
iter = iter.FilterOn(sc, obidefault.BatchSize())
}
return iter

View File

@ -16,6 +16,7 @@ var _onlyHead = false
// _saveGraph defaults to the "__@@NOSAVE@@__" placeholder
// (presumably meaning that no graph file is requested — confirm).
var _saveGraph = "__@@NOSAVE@@__"
// _saveRatio is the value returned by RatioTableFilename; it defaults to the
// same "__@@NOSAVE@@__" placeholder.
var _saveRatio = "__@@NOSAVE@@__"
// _minSample backs the --min-sample-count option and is returned by
// MinSampleCount; it defaults to 1.
var _minSample = 1
func ObicleanOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
@ -55,6 +56,9 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
"The ratio file follows the csv format."),
)
options.IntVar(&_minSample, "min-sample-count", _minSample,
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
)
}
func OptionSet(options *getoptions.GetOpt) {
@ -111,3 +115,8 @@ func IsSaveRatioTable() bool {
// RatioTableFilename returns the current value of the package-level
// _saveRatio setting.
func RatioTableFilename() string {
return _saveRatio
}
// MinSampleCount returns the value of the --min-sample-count option: the
// minimum number of samples a sequence must be present in to be considered
// in the analysis. The default is 1.
func MinSampleCount() int {
return _minSample
}

View File

@ -238,7 +238,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
log.Printf("End of the sequence Pairing")
}()
f := func(iterator obiiter.IBioSequence, wid int) {
f := func(iterator obiiter.IBioSequence) {
arena := obialign.MakePEAlignArena(150, 150)
shifts := make(map[int]int)
@ -263,9 +263,9 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
log.Printf("Start of the sequence Pairing using %d workers\n", nworkers)
for i := 0; i < nworkers-1; i++ {
go f(iterator.Split(), i)
go f(iterator.Split())
}
go f(iterator, nworkers-1)
go f(iterator)
return newIter
}