some renaming of functions

2026-02-02 22:30:34 +00:00 · 2025-08-06 15:54:50 -04:00
parent 997b6e8c01
commit 04f3af3e60
8 changed files with 174 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@

 !/obitests/**
 !/sample/**
+LLM/** 
--- a/pkg/obichunk/chunk.go
+++ b/pkg/obichunk/chunk.go
@@ -0,0 +1,17 @@
+package obichunk
+
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+)
+
+// ISequenceChunk hands iterator and classifier to ISequenceChunkOnMemory when
+// onMemory is true and to ISequenceChunkOnDisk otherwise, returning its results.
+func ISequenceChunk(iterator obiiter.IBioSequence,
+	classifier *obiseq.BioSequenceClassifier,
+	onMemory bool) (obiiter.IBioSequence, error) {
+	if onMemory {
+		return ISequenceChunkOnMemory(iterator, classifier)
+	}
+	return ISequenceChunkOnDisk(iterator, classifier)
+}
--- a/pkg/obichunk/chunks_on_memory.go
+++ b/pkg/obichunk/chunks_on_memory.go
@@ -9,7 +9,7 @@ import (
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 )

-// ISequenceChunkMemory processes a sequence iterator by distributing the sequences
+// ISequenceChunkOnMemory processes a sequence iterator by distributing the sequences
 // into chunks in memory. It uses a classifier to determine how to distribute
 // the sequences and returns a new iterator for the processed sequences.
 //
@@ -22,7 +22,7 @@ import (
 // An iterator of biosequences representing the processed chunks.
 //
 // The function operates asynchronously.
-func ISequenceChunkMemory(iterator obiiter.IBioSequence,
+func ISequenceChunkOnMemory(iterator obiiter.IBioSequence,
 	classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) {

 	newIter := obiiter.MakeIBioSequence()
--- a/pkg/obichunk/unique.go
+++ b/pkg/obichunk/unique.go
@@ -35,7 +35,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
 		}

 	} else {
-		iterator, err = ISequenceChunkMemory(iterator,
+		iterator, err = ISequenceChunkOnMemory(iterator,
 			obiseq.HashClassifier(opts.BatchCount()))

 		if err != nil {
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // commited

-var _Commit = "43b2855"
+var _Commit = "f239e8d"
 var _Version = "Release 4.4.0"

 // Version returns the version of the obitools package.
--- a/pkg/obitools/obiclust/obiclust.go
+++ b/pkg/obitools/obiclust/obiclust.go
@@ -0,0 +1 @@
+package obiclust
--- a/pkg/obitools/obiclust/options.go
+++ b/pkg/obitools/obiclust/options.go
@@ -0,0 +1,150 @@
+package obiclust
+
+import (
+	"fmt"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+)
+
+// ScoreNormalizationMode identifies which length, if any, is used to
+// normalize an alignment score. Values are produced by CLINormalizationMode.
+type ScoreNormalizationMode int
+
+const (
+	// NoNormalization is returned when no normalization reference is selected.
+	NoNormalization ScoreNormalizationMode = iota
+	// NormalizedByShortest corresponds to the --shortest option (length of
+	// the shortest sequence).
+	NormalizedByShortest
+	// NormalizedByLongest corresponds to the --longest option (length of
+	// the longest sequence).
+	NormalizedByLongest
+	// NormalizedByAlignment corresponds to the --alignment option
+	// (alignment length).
+	NormalizedByAlignment
+)
+
+// Storage for the obiclust command line options. The values below are the
+// defaults handed to go-getoptions by ObiclustOptionSet.
+var (
+	// Sample handling and clustering criteria.
+	_sampleAttribute = "sample"
+	_threshold       = 3.0
+	_ratioMax        = 1.0
+	_minSample       = 1
+	_onlyHead        = false
+
+	// Alignment score computation and clustering algorithm.
+	_normalizedScore = false
+	_shortReference  = false
+	_longReference   = false
+	_alignReference  = true
+	_distanceMode    = false
+	_exactMode       = false
+
+	// Ordering of the sequences before clustering.
+	_sorted_by_length    = false
+	_sorted_by_abundance = true
+	_sortingAscending    = false
+
+	// Optional outputs; "__@@NOSAVE@@__" is the sentinel meaning "do not save".
+	_saveGraph = "__@@NOSAVE@@__"
+	_saveRatio = "__@@NOSAVE@@__"
+)
+
+// ObiclustOptionSet registers the obiclust-specific command line options on
+// options. Each option is bound to a package-level variable whose current
+// value is used as the default.
+//
+// The merge threshold is registered as --threshold, while --distance is the
+// boolean flag switching from similarity score to alignment distance: the two
+// options must not share the name "distance", since an option name can only
+// be defined once.
+func ObiclustOptionSet(options *getoptions.GetOpt) {
+	options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
+		options.Alias("s"),
+		options.Description(fmt.Sprintf(
+			"Attribute containing sample descriptions (default %s).",
+			_sampleAttribute)))
+
+	// %g rather than %d: the threshold is a float64.
+	options.Float64Var(&_threshold, "threshold", _threshold,
+		options.Description(fmt.Sprintf(
+			"Threshold to merge sequences into the same cluster (default: %g).",
+			_threshold)))
+
+	options.Float64Var(&_ratioMax, "ratio", _ratioMax,
+		options.Alias("r"),
+		options.Description(fmt.Sprintf("Threshold ratio between counts (rare/abundant counts)"+
+			" of two sequence records so that the less abundant one is a variant of "+
+			"the more abundant (default: %3.2f).", _ratioMax)))
+
+	options.BoolVar(&_onlyHead, "head", _onlyHead,
+		options.Alias("H"),
+		options.Description("Select only sequences with the head status in at least one sample."),
+	)
+
+	options.StringVar(&_saveGraph, "save-graph", _saveGraph,
+		options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
+			"The graph files follow the graphml format."),
+	)
+
+	options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
+		options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
+			"The ratio file follows the csv format."),
+	)
+
+	options.IntVar(&_minSample, "min-sample-count", _minSample,
+		options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
+	)
+
+	options.BoolVar(&_normalizedScore, "normalized-score", _normalizedScore,
+		options.Alias("n"),
+		options.Description("Use alignment score normalized by length"),
+	)
+
+	options.BoolVar(&_shortReference, "shortest", _shortReference,
+		options.Description("Use length of the shortest sequence to normalize alignment score."),
+	)
+
+	options.BoolVar(&_longReference, "longest", _longReference,
+		options.Description("Use length of the longest sequence to normalize alignment score."),
+	)
+
+	options.BoolVar(&_alignReference, "alignment", _alignReference,
+		options.Description("Use alignment length to normalize alignment score."),
+	)
+
+	options.BoolVar(&_distanceMode, "distance", _distanceMode,
+		options.Description("Use alignment distance instead of similarity score."),
+	)
+
+	options.BoolVar(&_exactMode, "exact", _exactMode,
+		options.Description("Use exact clustering algorithm (default is greedy)."),
+	)
+
+	options.BoolVar(&_sorted_by_length, "length-ordered", _sorted_by_length,
+		options.Description("Sort sequence by length before clustering."),
+	)
+	options.BoolVar(&_sorted_by_abundance, "abundance-ordered", _sorted_by_abundance,
+		options.Description("Sort sequence by read counts (abundance) before clustering."),
+	)
+	options.BoolVar(&_sortingAscending, "ascending-sorting", _sortingAscending,
+		options.Description("Sort order is ascending (default is descending)."),
+	)
+}
+
+// OptionSet registers the complete option set of the command on options by
+// calling, in this order, obiconvert.InputOptionSet,
+// obiconvert.OutputOptionSet and ObiclustOptionSet.
+func OptionSet(options *getoptions.GetOpt) {
+	obiconvert.InputOptionSet(options)
+	obiconvert.OutputOptionSet(options)
+	ObiclustOptionSet(options)
+}
+
+// CLISampleAttribute returns the name of the attribute used to store the
+// sample name, as set by the --sample option.
+func CLISampleAttribute() string {
+	return _sampleAttribute
+}
+
+// CLIThreshold returns the threshold used to merge sequences into the same
+// cluster.
+func CLIThreshold() float64 {
+	return _threshold
+}
+
+// CLISaveGraphToFiles reports whether the graph files must be saved, i.e.
+// whether --save-graph changed _saveGraph from its "__@@NOSAVE@@__" sentinel.
+func CLISaveGraphToFiles() bool {
+	return _saveGraph != "__@@NOSAVE@@__"
+}
+
+// CLIGraphFilesDirectory returns the directory where the graph files are
+// saved, as set by the --save-graph option.
+func CLIGraphFilesDirectory() string {
+	return _saveGraph
+}
+
+// CLIIsSaveRatioTable reports whether the table of ratios must be saved, i.e.
+// whether --save-ratio changed _saveRatio from its "__@@NOSAVE@@__" sentinel.
+func CLIIsSaveRatioTable() bool {
+	return _saveRatio != "__@@NOSAVE@@__"
+}
+
+// CLIRatioTableFilename returns the name of the file that stores the ratio
+// table, as set by the --save-ratio option.
+func CLIRatioTableFilename() string {
+	return _saveRatio
+}
+
+// CLINormalizationMode returns the score normalization mode selected on the
+// command line.
+//
+// The explicit --longest and --shortest flags are tested before --alignment:
+// _alignReference defaults to true, so testing it first would make the two
+// other flags impossible to select.
+//
+// NOTE(review): _normalizedScore (--normalized-score) is not consulted here;
+// confirm whether it is meant to gate normalization altogether.
+func CLINormalizationMode() ScoreNormalizationMode {
+	switch {
+	case _longReference:
+		return NormalizedByLongest
+	case _shortReference:
+		return NormalizedByShortest
+	case _alignReference:
+		return NormalizedByAlignment
+	default:
+		return NoNormalization
+	}
+}
--- a/pkg/obitools/obirefidx/famlilyindexing.go
+++ b/pkg/obitools/obirefidx/famlilyindexing.go
@@ -166,7 +166,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
 		obidefault.ParallelWorkers(),
 	)

-	family_iterator, err := obichunk.ISequenceChunkMemory(
+	family_iterator, err := obichunk.ISequenceChunkOnMemory(
 		partof,
 		obiseq.AnnotationClassifier("family_taxid", "NA"),
 	)