From 04f3af3e60d6914b8f80735c997a1b6c5920cdc1 Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Wed, 6 Aug 2025 15:54:50 -0400
Subject: [PATCH] some renaming of functions

---
 .gitignore                                |   1 +
 pkg/obichunk/chunk.go                     |  17 +++
 pkg/obichunk/chunks_on_memory.go          |   4 +-
 pkg/obichunk/unique.go                    |   2 +-
 pkg/obioptions/version.go                 |   2 +-
 pkg/obitools/obiclust/obiclust.go         |   1 +
 pkg/obitools/obiclust/options.go          | 150 ++++++++++++++++++++++
 pkg/obitools/obirefidx/famlilyindexing.go |   2 +-
 8 files changed, 174 insertions(+), 5 deletions(-)
 create mode 100644 pkg/obichunk/chunk.go
 create mode 100644 pkg/obitools/obiclust/obiclust.go
 create mode 100644 pkg/obitools/obiclust/options.go

diff --git a/.gitignore b/.gitignore
index dcbc022..b7622a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@
 !/obitests/**
 !/sample/**
 
+LLM/**
diff --git a/pkg/obichunk/chunk.go b/pkg/obichunk/chunk.go
new file mode 100644
index 0000000..240ed44
--- /dev/null
+++ b/pkg/obichunk/chunk.go
@@ -0,0 +1,17 @@
+package obichunk
+
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// ISequenceChunk hands iterator and classifier to ISequenceChunkOnMemory when
+// onMemory is true and to ISequenceChunkOnDisk otherwise, returning its result.
+func ISequenceChunk(iterator obiiter.IBioSequence,
+	classifier *obiseq.BioSequenceClassifier,
+	onMemory bool) (obiiter.IBioSequence, error) {
+	if onMemory {
+		return ISequenceChunkOnMemory(iterator, classifier)
+	}
+	return ISequenceChunkOnDisk(iterator, classifier)
+}
diff --git a/pkg/obichunk/chunks_on_memory.go b/pkg/obichunk/chunks_on_memory.go
index 6a423ec..d00c59b 100644
--- a/pkg/obichunk/chunks_on_memory.go
+++ b/pkg/obichunk/chunks_on_memory.go
@@ -9,7 +9,7 @@ import (
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 )
 
-// ISequenceChunkMemory processes a sequence iterator by distributing the sequences
+// ISequenceChunkOnMemory processes a sequence iterator by distributing the sequences
 // into chunks in memory. 
It uses a classifier to determine how to distribute // the sequences and returns a new iterator for the processed sequences. // @@ -22,7 +22,7 @@ import ( // An iterator of biosequences representing the processed chunks. // // The function operates asynchronously. -func ISequenceChunkMemory(iterator obiiter.IBioSequence, +func ISequenceChunkOnMemory(iterator obiiter.IBioSequence, classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) { newIter := obiiter.MakeIBioSequence() diff --git a/pkg/obichunk/unique.go b/pkg/obichunk/unique.go index 8b3127d..9e98bf0 100644 --- a/pkg/obichunk/unique.go +++ b/pkg/obichunk/unique.go @@ -35,7 +35,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence, } } else { - iterator, err = ISequenceChunkMemory(iterator, + iterator, err = ISequenceChunkOnMemory(iterator, obiseq.HashClassifier(opts.BatchCount())) if err != nil { diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 605ac73..56470e9 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "43b2855" +var _Commit = "f239e8d" var _Version = "Release 4.4.0" // Version returns the version of the obitools package. 
diff --git a/pkg/obitools/obiclust/obiclust.go b/pkg/obitools/obiclust/obiclust.go
new file mode 100644
index 0000000..5ffa951
--- /dev/null
+++ b/pkg/obitools/obiclust/obiclust.go
@@ -0,0 +1 @@
+package obiclust
diff --git a/pkg/obitools/obiclust/options.go b/pkg/obitools/obiclust/options.go
new file mode 100644
index 0000000..ea4f6d0
--- /dev/null
+++ b/pkg/obitools/obiclust/options.go
@@ -0,0 +1,190 @@
+package obiclust
+
+import (
+	"fmt"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// ScoreNormalizationMode identifies the length used as the reference when an
+// alignment score is normalized.
+type ScoreNormalizationMode int
+
+const (
+	// NoNormalization is returned when no reference length is selected.
+	NoNormalization ScoreNormalizationMode = iota
+	// NormalizedByShortest selects the length of the shortest sequence.
+	NormalizedByShortest
+	// NormalizedByLongest selects the length of the longest sequence.
+	NormalizedByLongest
+	// NormalizedByAlignment selects the alignment length.
+	NormalizedByAlignment
+)
+
+// _noSave is the default value of the --save-graph and --save-ratio options.
+// Any other value means that the corresponding output was requested.
+const _noSave = "__@@NOSAVE@@__"
+
+// Option values. Each variable is initialized with the default of the option
+// it is bound to in ObiclustOptionSet.
+var _sampleAttribute = "sample"
+var _threshold = 3.0
+var _shortReference = false
+var _longReference = false
+var _alignReference = true
+var _normalizedScore = false
+var _distanceMode = false
+var _exactMode = false
+var _ratioMax = 1.0
+var _sortedByLength = false
+var _sortedByAbundance = true
+var _sortingAscending = false
+var _onlyHead = false
+var _saveGraph = _noSave
+var _saveRatio = _noSave
+var _minSample = 1
+
+// ObiclustOptionSet registers the obiclust specific command line options on
+// options. Every option stores its value in one of the package-level
+// variables declared above.
+func ObiclustOptionSet(options *getoptions.GetOpt) {
+	options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
+		options.Alias("s"),
+		options.Description(fmt.Sprintf(
+			"Attribute containing sample descriptions (default %s).",
+			_sampleAttribute)))
+
+	// Registered as "threshold": the name "distance" belongs to the boolean
+	// option below and go-getoptions refuses a name or alias defined twice.
+	options.Float64Var(&_threshold, "threshold", _threshold,
+		options.Description(fmt.Sprintf(
+			"Threshold to merge sequences into the same cluster (default: %g).",
+			_threshold)))
+
+	options.Float64Var(&_ratioMax, "ratio", _ratioMax,
+		options.Alias("r"),
+		options.Description(fmt.Sprintf(
+			"Threshold ratio between counts (rare/abundant counts)"+
+				" of two sequence records so that the less abundant one is a variant of "+
+				"the more abundant (default: %3.2f).",
+			_ratioMax)))
+
+	options.BoolVar(&_onlyHead, "head", _onlyHead,
+		options.Alias("H"),
+		options.Description("Select only sequences with the head status in at least one sample."),
+	)
+
+	options.StringVar(&_saveGraph, "save-graph", _saveGraph,
+		options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
+			"The graph files follow the graphml format."),
+	)
+
+	options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
+		options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
+			"The ratio file follows the csv format."),
+	)
+
+	options.IntVar(&_minSample, "min-sample-count", _minSample,
+		options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
+	)
+
+	options.BoolVar(&_normalizedScore, "normalized-score", _normalizedScore,
+		options.Alias("n"),
+		options.Description("Use alignment score normalized by length"),
+	)
+
+	options.BoolVar(&_shortReference, "shortest", _shortReference,
+		options.Description("Use length of the shortest sequence to normalize alignment score."),
+	)
+
+	options.BoolVar(&_longReference, "longest", _longReference,
+		options.Description("Use length of the longest sequence to normalize alignment score."),
+	)
+
+	// NOTE(review): the default of this flag is already true; confirm what
+	// go-getoptions stores when --alignment is given on the command line.
+	options.BoolVar(&_alignReference, "alignment", _alignReference,
+		options.Description("Use alignment length to normalize alignment score."),
+	)
+
+	options.BoolVar(&_distanceMode, "distance", _distanceMode,
+		options.Description("Use alignment distance instead of similarity score."),
+	)
+
+	options.BoolVar(&_exactMode, "exact", _exactMode,
+		options.Description("Use exact clustering algorithm (default is greedy)."),
+	)
+
+	options.BoolVar(&_sortedByLength, "length-ordered", _sortedByLength,
+		options.Description("Sort sequence by length before clustering."),
+	)
+	// NOTE(review): same remark as for --alignment, the default is true.
+	options.BoolVar(&_sortedByAbundance, "abundance-ordered", _sortedByAbundance,
+		options.Description("Sort sequence by read counts (abundance) before clustering."),
+	)
+	options.BoolVar(&_sortingAscending, "ascending-sorting", _sortingAscending,
+		options.Description("Sort order is ascending (default is descending)."),
+	)
+}
+
+// OptionSet registers the obiconvert input and output options followed by
+// the obiclust specific ones.
+func OptionSet(options *getoptions.GetOpt) {
+	obiconvert.InputOptionSet(options)
+	obiconvert.OutputOptionSet(options)
+	ObiclustOptionSet(options)
+}
+
+// CLISampleAttribute returns the name of the attribute used to store the
+// sample name.
+func CLISampleAttribute() string {
+	return _sampleAttribute
+}
+
+// CLIThreshold returns the threshold used to merge sequences into the same
+// cluster.
+func CLIThreshold() float64 {
+	return _threshold
+}
+
+// CLISaveGraphToFiles reports whether the graphs must be saved.
+func CLISaveGraphToFiles() bool {
+	return _saveGraph != _noSave
+}
+
+// CLIGraphFilesDirectory returns the directory where the graph files are
+// saved.
+func CLIGraphFilesDirectory() string {
+	return _saveGraph
+}
+
+// CLIIsSaveRatioTable reports whether the table of ratios must be saved.
+func CLIIsSaveRatioTable() bool {
+	return _saveRatio != _noSave
+}
+
+// CLIRatioTableFilename returns the name of the file that stores the ratio
+// table.
+func CLIRatioTableFilename() string {
+	return _saveRatio
+}
+
+// CLINormalizationMode returns the reference length selected to normalize
+// alignment scores.
+//
+// The explicit --longest and --shortest flags are tested before
+// _alignReference: that variable defaults to true, so testing it first would
+// make the two other flags unreachable.
+func CLINormalizationMode() ScoreNormalizationMode {
+	switch {
+	case _longReference:
+		return NormalizedByLongest
+	case _shortReference:
+		return NormalizedByShortest
+	case _alignReference:
+		return NormalizedByAlignment
+	default:
+		return NoNormalization
+	}
+}
diff --git a/pkg/obitools/obirefidx/famlilyindexing.go b/pkg/obitools/obirefidx/famlilyindexing.go
index e445c19..c862948 100644
--- a/pkg/obitools/obirefidx/famlilyindexing.go
+++ b/pkg/obitools/obirefidx/famlilyindexing.go
@@ -166,7 +166,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
 		obidefault.ParallelWorkers(),
 	)
 
-	family_iterator, err := obichunk.ISequenceChunkMemory(
+	family_iterator, err := obichunk.ISequenceChunkOnMemory(
 		partof,
 		obiseq.AnnotationClassifier("family_taxid", "NA"),
 	)