add the --min-sample-count option to obiclean.

This commit is contained in:
Eric Coissac
2025-02-24 08:48:31 +01:00
parent 51d11aa36d
commit 8671285d02
9 changed files with 43 additions and 23 deletions

View File

@ -21,15 +21,15 @@ func encodeValues(score, length int, out bool) uint64 {
return fo
}
// _isout reports whether the bit at position dwsize of value is cleared.
// It returns true when that bit is 0 and false when it is 1; every other
// bit of value is ignored.
func _isout(value uint64) bool {
	flag := value & (uint64(1) << dwsize)
	return flag == 0
}
// func _isout(value uint64) bool {
// const outmask = uint64(1) << dwsize
// return (value & outmask) == 0
// }
// _lpath increments value by one, complements the result bitwise and
// returns the wsize low-order bits of that complement as an int.
func _lpath(value uint64) int {
	const lowBits = uint64(1<<wsize) - 1

	// Flipping the low bits with XOR and then masking is the same as
	// masking the full bitwise complement.
	complemented := ^(value + 1)
	return int(complemented & lowBits)
}
// func _lpath(value uint64) int {
// const mask = uint64(1<<wsize) - 1
// return int(((value + 1) ^ mask) & mask)
// }
func decodeValues(value uint64) (int, int, bool) {
const mask = uint64(1<<wsize) - 1
@ -57,4 +57,3 @@ func _setout(value uint64) uint64 {
// Pre-encoded values, each built by a single call to
// encodeValues(score, length, out).

// _empty encodes score 0, length 0 with the out flag false.
var _empty = encodeValues(0, 0, false)
// _out encodes score 0, length 30000 with the out flag true.
var _out = encodeValues(0, 30000, true)
// _notavail encodes score 0, length 30000 with the out flag false.
// NOTE(review): 30000 presumably acts as an "unreachably long" length — confirm.
var _notavail = encodeValues(0, 30000, false)

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "fb6f857"
var _Commit = "51d11aa"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -196,6 +196,16 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
return f
}
// OccurInAtleast builds a SequencePredicate that accepts a sequence when the
// statistics computed on the sample attribute contain at least n entries.
//
// The statistics description is created once from sample and reused for every
// sequence tested. "NA" is forwarded unchanged to StatsOn.
// NOTE(review): "NA" is presumably the label used when the attribute is
// missing, in which case it counts as one entry — confirm against StatsOn.
func OccurInAtleast(sample string, n int) SequencePredicate {
	description := MakeStatsOnDescription(sample)

	return func(sequence *BioSequence) bool {
		return n <= len(sequence.StatsOn(description, "NA"))
	}
}
func IsSequenceMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile("(?i)" + pattern)

View File

@ -368,7 +368,12 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA")
if OnlyHead() {
iter = iter.FilterOn(IsHead, 1000)
iter = iter.FilterOn(IsHead, obidefault.BatchSize())
}
if MinSampleCount() > 1 {
sc := obiseq.OccurInAtleast(SampleAttribute(), MinSampleCount())
iter = iter.FilterOn(sc, obidefault.BatchSize())
}
return iter

View File

@ -16,6 +16,7 @@ var _onlyHead = false
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"
var _minSample = 1
func ObicleanOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
@ -55,6 +56,9 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
"The ratio file follows the csv format."),
)
options.IntVar(&_minSample, "min-sample-count", _minSample,
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
)
}
func OptionSet(options *getoptions.GetOpt) {
@ -111,3 +115,8 @@ func IsSaveRatioTable() bool {
func RatioTableFilename() string {
return _saveRatio
}
// MinSampleCount returns the value of the --min-sample-count option: the
// minimum number of samples a sequence must be present in to be considered
// in the analysis.
func MinSampleCount() int {
return _minSample
}

View File

@ -238,7 +238,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
log.Printf("End of the sequence Pairing")
}()
f := func(iterator obiiter.IBioSequence, wid int) {
f := func(iterator obiiter.IBioSequence) {
arena := obialign.MakePEAlignArena(150, 150)
shifts := make(map[int]int)
@ -263,9 +263,9 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
log.Printf("Start of the sequence Pairing using %d workers\n", nworkers)
for i := 0; i < nworkers-1; i++ {
go f(iterator.Split(), i)
go f(iterator.Split())
}
go f(iterator, nworkers-1)
go f(iterator)
return newIter
}