Refactoring code to remove the buffer size options, and some other changes...

Former-commit-id: 10b57cc1a27446ade3c444217341e9651e89cdce
This commit is contained in:
2023-03-07 11:12:13 +07:00
parent 9811e440b8
commit d88de15cdc
52 changed files with 1172 additions and 421 deletions

View File

@@ -0,0 +1,63 @@
package obicleandb
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obichunk"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
)
// ICleanDB builds the reference database cleaning pipeline on top of
// itertator and returns the resulting iterator.
//
// The pipeline runs three stages:
//  1. dereplication through obichunk.IUniqueSequence, using "taxid" as
//     sub-category so that only sequences sharing the same taxid are merged;
//  2. filtering on taxonomy.IsAValidTaxon combined with a rank predicate:
//     obigrep.CLIHasRankDefinedPredicate when obigrep.CLIRequiredRanks is
//     not empty, otherwise the species, genus and family required-rank
//     predicates chained with And;
//  3. three successive workers built by MakeSetSpeciesWorker,
//     MakeSetGenusWorker and MakeSetFamilyWorker.
//
// An error reported by the dereplication step is passed to log.Fatal.
func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
	// Dereplicate with a constraint on the taxid: to be merged, both
	// sequences must have the same taxid.
	uniqOptions := []obichunk.WithOption{
		obichunk.OptionBatchCount(100),
		obichunk.OptionSortOnMemory(),
		obichunk.OptionSubCategory("taxid"),
		obichunk.OptionsParallelWorkers(obioptions.CLIParallelWorkers()),
		obichunk.OptionsBatchSize(obioptions.CLIBatchSize()),
		obichunk.OptionNAValue("NA"),
	}

	dereplicated, err := obichunk.IUniqueSequence(itertator, uniqOptions...)
	if err != nil {
		log.Fatal(err)
	}

	taxonomy := obigrep.CLILoadSelectedTaxonomy()

	// Ranks requested on the command line take precedence over the default
	// species/genus/family triplet.
	var hasRanks obiseq.SequencePredicate
	if len(obigrep.CLIRequiredRanks()) == 0 {
		hasRanks = taxonomy.HasRequiredRank("species")
		for _, rank := range []string{"genus", "family"} {
			hasRanks = hasRanks.And(taxonomy.HasRequiredRank(rank))
		}
	} else {
		hasRanks = obigrep.CLIHasRankDefinedPredicate()
	}

	validTaxon := taxonomy.IsAValidTaxon(CLIUpdateTaxids()).And(hasRanks)

	usable := dereplicated.FilterOn(validTaxon,
		obioptions.CLIBatchSize(),
		obioptions.CLIParallelWorkers())

	withSpecies := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
		obioptions.CLIParallelWorkers())
	withGenus := withSpecies.MakeIWorker(taxonomy.MakeSetGenusWorker(),
		obioptions.CLIParallelWorkers())
	annotated := withGenus.MakeIWorker(taxonomy.MakeSetFamilyWorker(),
		obioptions.CLIParallelWorkers())

	// Disabled step, kept for reference:
	// annotated.MakeIConditionalWorker(obiseq.IsMoreAbundantOrEqualTo(3),1000)

	return annotated
}

View File

@@ -60,6 +60,21 @@ func InputOptionSet(options *getoptions.GetOpt) {
}
// OutputModeOptionSet registers on options the command line flags that
// control how the output is produced, whatever its format:
//
//	--no-progressbar        bound to __no_progress_bar__
//	--compress, -Z          bound to __compressed__
//	--out, -o FILENAME      bound to __output_file_name__
func OutputModeOptionSet(options *getoptions.GetOpt) {
	options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
		options.Description("Disable the progress bar printing"))

	options.BoolVar(&__compressed__, "compress", false,
		options.Alias("Z"),
		options.Description("Output is compressed"))

	// The current value of __output_file_name__ is used as the default.
	options.StringVar(&__output_file_name__, "out", __output_file_name__,
		options.Alias("o"),
		options.ArgName("FILENAME"),
		options.Description("Filename used for saving the output"))
}
func OutputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
options.Description("Read data following the ecoPCR output format."))
@@ -73,19 +88,7 @@ func OutputOptionSet(options *getoptions.GetOpt) {
options.Alias("O"),
options.Description("output FASTA/FASTQ title line annotations follow OBI format."))
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
options.Description("Disable the progress bar printing"))
options.BoolVar(&__compressed__, "compress", false,
options.Alias("Z"),
options.Description("Output is compressed"))
options.StringVar(&__output_file_name__, "out", __output_file_name__,
options.Alias("o"),
options.ArgName("FILENAME"),
options.Description("Filename used for saving the output"),
)
OutputModeOptionSet(options)
}
func PairedFilesOptionSet(options *getoptions.GetOpt) {
@@ -197,4 +200,4 @@ func CLIHasPairedFile() bool {
}
// CLIPairedFileName returns the current value of the package-level
// __paired_file_name__ variable.
func CLIPairedFileName() string {
	return __paired_file_name__
}
}

View File

@@ -48,6 +48,10 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
strings.HasSuffix(path, "fasta.gz") ||
strings.HasSuffix(path, "fastq") ||
strings.HasSuffix(path, "fastq.gz") ||
strings.HasSuffix(path, "seq") ||
strings.HasSuffix(path, "seq.gz") ||
strings.HasSuffix(path, "gb") ||
strings.HasSuffix(path, "gb.gz") ||
strings.HasSuffix(path, "dat") ||
strings.HasSuffix(path, "dat.gz") ||
strings.HasSuffix(path, "ecopcr") ||
@@ -82,13 +86,12 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
}
nworkers := obioptions.CLIParallelWorkers() // / 4
nworkers := obioptions.CLIParallelWorkers()
if nworkers < 2 {
nworkers = 2
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBufferSize(obioptions.CLIBufferSize()))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))

View File

@@ -60,7 +60,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBufferSize(obioptions.CLIBufferSize()))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(CLIOutputQualityShift()))

View File

@@ -0,0 +1,61 @@
package obicsv
import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
)
// CLIWriteCSV writes the sequences provided by iterator in CSV format,
// configured from the command line options.
//
// The output goes to the first entry of filenames, or to the standard output
// when filenames is empty; entries after the first one are not used. When
// obiconvert.CLIProgressBar reports true, the iterator is first wrapped with
// its Speed method.
//
// When terminalAction is true, Recycle is called on the iterator returned by
// the writer and obiiter.NilIBioSequence is returned. Otherwise the iterator
// returned by the writer is handed back to the caller.
//
// An error raised while setting up the writer is reported with log.Fatalf,
// which terminates the program; the error result is therefore nil whenever
// this function returns.
func CLIWriteCSV(iterator obiiter.IBioSequence,
	terminalAction bool, filenames ...string) (obiiter.IBioSequence, error) {

	if obiconvert.CLIProgressBar() {
		iterator = iterator.Speed()
	}

	// Writing uses a quarter of the configured workers, but never fewer
	// than two.
	nworkers := obioptions.CLIParallelWorkers() / 4
	if nworkers < 2 {
		nworkers = 2
	}

	// The kept keys must be resolved BEFORE the CLIPrint* getters are read:
	// CLIToBeKeptAttributes removes "count", "taxid" and "scientific_name"
	// from the key list and switches on the matching count/taxon flags
	// instead. Reading the flags first would drop those columns silently.
	keptKeys := CLIToBeKeptAttributes()

	opts := make([]obiformats.WithOption, 0, 10)
	opts = append(opts,
		obiformats.OptionsParallelWorkers(nworkers),
		obiformats.OptionsBatchSize(obioptions.CLIBatchSize()),
		obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()),
		obiformats.OptionsCompressed(obiconvert.CLICompressed()),
		obiformats.CSVId(CLIPrintId()),
		obiformats.CSVCount(CLIPrintCount()),
		obiformats.CSVTaxon(CLIPrintTaxon()),
		obiformats.CSVDefinition(CLIPrintDefinition()),
		obiformats.CSVKeys(keptKeys),
	)

	var (
		newIter obiiter.IBioSequence
		err     error
	)

	if len(filenames) == 0 {
		newIter, err = obiformats.WriteCSVToStdout(iterator, opts...)
	} else {
		newIter, err = obiformats.WriteCSVToFile(iterator, filenames[0], opts...)
	}

	if err != nil {
		log.Fatalf("Write file error: %v", err)
		// Not reached: log.Fatalf exits the process.
		return obiiter.NilIBioSequence, err
	}

	if terminalAction {
		newIter.Recycle()
		return obiiter.NilIBioSequence, nil
	}

	return newIter, nil
}

View File

@@ -0,0 +1,126 @@
package obicsv
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Package-level state of the CSV command line options. Each variable holds
// the default of its option and is bound to it by CSVOptionSet.
var (
	_outputIds        = true  // --ids / -i
	_outputCount      = false // --count
	_outputTaxon      = false // --taxon
	_outputSequence   = true  // --sequence / -s
	_outputQuality    = true  // --quality / -q
	_outputDefinition = false // --definition / -d
	_obipairing       = false // --obipairing
	_autoColumns      = false // --auto

	// _keepOnly lists the attribute keys given with --keep / -k.
	_keepOnly = make([]string, 0)

	// _naValue is the string given with --na-value.
	_naValue = "NA"

	// _softAttributes maps a name to the attribute keys that
	// CLIToBeKeptAttributes adds to the kept keys when the option of the
	// same name is set.
	_softAttributes = map[string][]string{
		"obipairing": {
			"mode", "seq_a_single", "seq_b_single",
			"ali_dir", "score", "score_norm",
			"seq_ab_match", "pairing_mismatches",
		},
	}
)
// CSVOptionSet registers on options the command line flags specific to the
// CSV output. Every flag is bound to a package-level variable whose current
// value is passed as the default of the option.
//
// The help strings are user-facing; their spelling has been corrected
// ("output", "attributes").
func CSVOptionSet(options *getoptions.GetOpt) {
	options.BoolVar(&_outputIds, "ids", _outputIds,
		options.Alias("i"),
		options.Description("Prints sequence ids in the output."))

	options.BoolVar(&_outputSequence, "sequence", _outputSequence,
		options.Alias("s"),
		options.Description("Prints sequence itself in the output."))

	options.BoolVar(&_outputQuality, "quality", _outputQuality,
		options.Alias("q"),
		options.Description("Prints sequence quality in the output."))

	options.BoolVar(&_outputDefinition, "definition", _outputDefinition,
		options.Alias("d"),
		options.Description("Prints sequence definition in the output."))

	options.BoolVar(&_autoColumns, "auto", _autoColumns,
		options.Description("Based on the first sequences, propose a list of attributes to print"))

	options.BoolVar(&_outputCount, "count", _outputCount,
		options.Description("Prints the count attribute in the output"))

	options.BoolVar(&_outputTaxon, "taxon", _outputTaxon,
		options.Description("Prints the NCBI taxid and its related scientific name"))

	options.BoolVar(&_obipairing, "obipairing", _obipairing,
		options.Description("Prints the attributes added by obipairing"))

	// Exactly one KEY per -k occurrence (min = max = 1).
	options.StringSliceVar(&_keepOnly, "keep", 1, 1,
		options.Alias("k"),
		options.ArgName("KEY"),
		options.Description("Keeps only attribute with key <KEY>. Several -k options can be combined."))

	options.StringVar(&_naValue, "na-value", _naValue,
		options.ArgName("NAVALUE"),
		options.Description("A string representing non available values in the CSV file."))
}
// OptionSet registers every command line option of the CSV tool on options:
// first the output mode options declared by obiconvert.OutputModeOptionSet,
// then the CSV-specific ones declared by CSVOptionSet.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.OutputModeOptionSet(options)
	CSVOptionSet(options)
}
// CLIPrintId returns the value of the flag bound to the --ids option.
func CLIPrintId() bool {
	return _outputIds
}
// CLIPrintSequence returns the value of the flag bound to the --sequence
// option.
func CLIPrintSequence() bool {
	return _outputSequence
}
// CLIPrintCount returns the value of the flag bound to the --count option.
// The flag is also switched on by CLIToBeKeptAttributes when "count" is
// among the kept keys.
func CLIPrintCount() bool {
	return _outputCount
}
// CLIPrintTaxon returns the value of the flag bound to the --taxon option.
// The flag is also switched on by CLIToBeKeptAttributes when "taxid" or
// "scientific_name" is among the kept keys.
func CLIPrintTaxon() bool {
	return _outputTaxon
}
// CLIPrintQuality returns the value of the flag bound to the --quality
// option.
func CLIPrintQuality() bool {
	return _outputQuality
}
// CLIPrintDefinition returns the value of the flag bound to the --definition
// option.
func CLIPrintDefinition() bool {
	return _outputDefinition
}
// CLIAutoColumns returns the value of the flag bound to the --auto option.
func CLIAutoColumns() bool {
	return _autoColumns
}
// CLIHasToBeKeptAttributes reports whether the list of kept attribute keys
// currently holds at least one entry.
func CLIHasToBeKeptAttributes() bool {
	return len(_keepOnly) != 0
}
// CLIToBeKeptAttributes returns the attribute keys to print as CSV columns
// and updates the package-level option state accordingly:
//
//   - when the obipairing flag is set, the "obipairing" soft attributes are
//     added to the kept keys;
//   - "count" is removed from the kept keys and the count flag is switched
//     on instead;
//   - "taxid" and "scientific_name" are removed from the kept keys and the
//     taxon flag is switched on instead.
//
// The function is idempotent: soft attributes already present are not added
// again, so repeated calls do not create duplicate keys, and every
// occurrence of a promoted key is removed rather than only the first one.
func CLIToBeKeptAttributes() []string {
	if _obipairing {
		for _, key := range _softAttributes["obipairing"] {
			// Skip keys already requested by the user or added by a
			// previous call.
			if goutils.LookFor(_keepOnly, key) < 0 {
				_keepOnly = append(_keepOnly, key)
			}
		}
	}

	// Keys handled by a dedicated flag rather than as generic attributes.
	promoted := []struct {
		key  string
		flag *bool
	}{
		{"count", &_outputCount},
		{"taxid", &_outputTaxon},
		{"scientific_name", &_outputTaxon},
	}

	for _, p := range promoted {
		for i := goutils.LookFor(_keepOnly, p.key); i >= 0; i = goutils.LookFor(_keepOnly, p.key) {
			_keepOnly = goutils.RemoveIndex(_keepOnly, i)
			*p.flag = true
		}
	}

	return _keepOnly
}
// CLINAValue returns the string bound to the --na-value option.
func CLINAValue() string {
	return _naValue
}

View File

@@ -31,7 +31,6 @@ func DistributeSequence(sequences obiiter.IBioSequence) {
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers),
obiformats.OptionsBufferSize(obioptions.CLIBufferSize()),
obiformats.OptionsBatchSize(obioptions.CLIBatchSize()),
obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()),
obiformats.OptionsAppendFile(CLIAppendSequences()),

View File

@@ -39,7 +39,6 @@ func CLIFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence {
newIter = iterator.FilterOn(predicate,
obioptions.CLIBatchSize(),
obioptions.CLIParallelWorkers(),
obioptions.CLIBufferSize(),
)
}
} else {

View File

@@ -20,7 +20,6 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error
obingslibrary.OptionDiscardErrors(!CLIConservedErrors()),
obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()),
obingslibrary.OptionBatchSize(obioptions.CLIBatchSize()),
obingslibrary.OptionBufferSize(obioptions.CLIBufferSize()),
)
ngsfilter, err := CLINGSFIlter()

View File

@@ -211,17 +211,13 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
}
nworkers := obioptions.CLIMaxCPU() * 3 / 2
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
nworkers = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
newIter := obiiter.MakeIBioSequence(buffsize)
newIter := obiiter.MakeIBioSequence()
newIter.Add(nworkers)

View File

@@ -51,8 +51,6 @@ func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
options = append(options,
obichunk.OptionsParallelWorkers(
obioptions.CLIParallelWorkers()),
obichunk.OptionsBufferSize(
obioptions.CLIBufferSize()),
obichunk.OptionsBatchSize(
obioptions.CLIBatchSize()),
obichunk.OptionNAValue(CLINAValue()),