From cf5b4baa54eda0e340af9325c3149ccb421e2c01 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 30 May 2022 16:28:59 +0200 Subject: [PATCH] Small bug in sequence counting on merge --- pkg/obichunk/subchunks.go | 2 +- pkg/obichunk/unique.go | 7 ++++++- pkg/obiseq/merge.go | 3 ++- pkg/obitools/obiuniq/options.go | 2 +- pkg/obitools/obiuniq/unique.go | 11 +++++++++++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pkg/obichunk/subchunks.go b/pkg/obichunk/subchunks.go index dbce3d6..dc40edf 100644 --- a/pkg/obichunk/subchunks.go +++ b/pkg/obichunk/subchunks.go @@ -11,7 +11,7 @@ import ( ) // -// Interface for sorting a list of sequences accoording to +// Interface for sorting a list of sequences according to // their classes // diff --git a/pkg/obichunk/unique.go b/pkg/obichunk/unique.go index 33c3e65..ddfddb6 100644 --- a/pkg/obichunk/unique.go +++ b/pkg/obichunk/unique.go @@ -9,6 +9,9 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) +// Runs dereplication algorithm on a obiiter.IBioSequenceBatch +// iterator. 
+ func IUniqueSequence(iterator obiiter.IBioSequenceBatch, options ...WithOption) (obiiter.IBioSequenceBatch, error) { @@ -62,7 +65,9 @@ func IUniqueSequence(iterator obiiter.IBioSequenceBatch, return neworder } - var ff func(obiiter.IBioSequenceBatch, *obiseq.BioSequenceClassifier, int) + var ff func(obiiter.IBioSequenceBatch, + *obiseq.BioSequenceClassifier, + int) cat := opts.Categories() na := opts.NAValue() diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index 29bb581..7737ef3 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -37,6 +37,7 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { newstat = false case map[string]interface{}: stats = make(StatsOnValues, len(istat)) + newstat = false var err error for k, v := range istat { stats[k], err = goutils.InterfaceToInt(v) @@ -161,7 +162,7 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequenc seq.SetQualities(nil) if len(sequences) == 1 { - seq.Annotations()["count"] = 1 + seq.Annotations()["count"] = seq.Count() for _, v := range statsOn { seq.StatsOn(v, na) } diff --git a/pkg/obitools/obiuniq/options.go b/pkg/obitools/obiuniq/options.go index 7caa82b..2b64718 100644 --- a/pkg/obitools/obiuniq/options.go +++ b/pkg/obitools/obiuniq/options.go @@ -41,7 +41,7 @@ func UniqueOptionSet(options *getoptions.GetOpt) { } // OptionSet adds to the basic option set every options declared for -// the obipcr command +// the obiuniq command func OptionSet(options *getoptions.GetOpt) { obiconvert.OptionSet(options) UniqueOptionSet(options) diff --git a/pkg/obitools/obiuniq/unique.go b/pkg/obitools/obiuniq/unique.go index c3d6b77..356d9b7 100644 --- a/pkg/obitools/obiuniq/unique.go +++ b/pkg/obitools/obiuniq/unique.go @@ -16,6 +16,11 @@ func Unique(sequences obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { obichunk.OptionBatchCount(CLINumberOfChunks()), ) + // + // Considers if data splitting must be done on disk or in memory + // + // --on-disk 
command line option + if CLIUniqueInMemory() { log.Printf("Running dereplication in memory on %d chunks", CLINumberOfChunks()) options = append(options, obichunk.OptionSortOnMemory()) @@ -24,6 +29,12 @@ func Unique(sequences obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { options = append(options, obichunk.OptionSortOnDisk()) } + // + // Considers if sequences observed a single time in the dataset have to + // be conserved in the output + // + // --no-singleton + if CLINoSingleton() { log.Printf("Removing sigletons from the output") options = append(options, obichunk.OptionsNoSingleton())