From be47ec909c66ab62565412e3a5e7bda351bd9818 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 17 Jul 2023 14:24:02 +0200 Subject: [PATCH] add the --skip-empty option Former-commit-id: ec9cb0ecaf90a61bf9289cf4c089b5cc2fcb65a5 --- Release-notes.md | 8 +++++++- cmd/obitools/obiclean/main.go | 5 +++-- go.mod | 1 + go.sum | 2 ++ pkg/obiformats/fastseq_write_fasta.go | 16 ++++++++++++---- pkg/obiformats/fastseq_write_fastq.go | 17 +++++++++++++---- pkg/obiformats/options.go | 14 ++++++++++++++ pkg/obitools/obiclean/obiclean.go | 2 +- pkg/obitools/obiconvert/options.go | 8 ++++++++ pkg/obitools/obiconvert/sequence_writer.go | 4 ++++ 10 files changed, 65 insertions(+), 12 deletions(-) diff --git a/Release-notes.md b/Release-notes.md index fd33d93..42059b2 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -4,7 +4,9 @@ ### Bugs -- Patch a bug in the install script for correctly follow download redirection. +- Patch a bug in the install-script to correctly follow download redirection. +- Patch a bug in `obitagpcr` to consider the renaming of the `forward_mismatch` and `reverse_mismatch` tags + to `forward_error` and `reverse_error`. ### Enhancement @@ -14,6 +16,10 @@ ### New feature +- In every *OBITools*, writing an empty sequence (sequence of length equal to zero) throws an error and + stops the execution of the tool, except if the **--skip-empty** option is set. In that case, the empty + sequence is ignored and not printed to the output. When the output involves paired sequences, the **--skip-empty** + option is ignored. - In `obiannotate` adds the **--set-identifier** option to edit the sequence identifier - In `obitag` adds the **--save-db** option allowing at the end of the run of `obitag` to save a modified version of the reference database containing the computed index. 
This allows next diff --git a/cmd/obitools/obiclean/main.go b/cmd/obitools/obiclean/main.go index f012c7f..7b012fe 100644 --- a/cmd/obitools/obiclean/main.go +++ b/cmd/obitools/obiclean/main.go @@ -2,6 +2,7 @@ package main import ( "os" + log "github.com/sirupsen/logrus" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" @@ -19,11 +20,11 @@ func main() { fs, err := obiconvert.CLIReadBioSequences(args...) if err != nil { - log.Errorf("Cannot open file (%v)",err) + log.Errorf("Cannot open file (%v)", err) os.Exit(1) } - cleaned := obiclean.IOBIClean(fs) + cleaned := obiclean.CLIOBIClean(fs) obiconvert.CLIWriteBioSequences(cleaned, true) diff --git a/go.mod b/go.mod index e19c93d..6f10778 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( ) require ( + github.com/chen3feng/stl4go v0.1.1 // indirect github.com/klauspost/compress v1.16.3 // indirect github.com/mattn/go-runewidth v0.0.14 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect diff --git a/go.sum b/go.sum index 459755e..3dba408 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM= +github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q= +github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU= github.com/daichi-m/go18ds v1.12.1 h1:Pjc3IApmN4qtDiovGP9MvMpIzgZle3SHUcNaA5j46bg= github.com/daichi-m/go18ds v1.12.1/go.mod h1:wc2dURUr8aMxxC4Mn5ObJGVM7uIKU8JagY4nhtonXq8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= diff --git 
a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go index 20952a5..0d050cf 100644 --- a/pkg/obiformats/fastseq_write_fasta.go +++ b/pkg/obiformats/fastseq_write_fasta.go @@ -55,11 +55,19 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string { folded) } -func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader) []byte { +func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) []byte { var bs bytes.Buffer for _, seq := range batch.Slice() { - bs.WriteString(FormatFasta(seq, formater)) - bs.WriteString("\n") + if seq.Len() > 0 { + bs.WriteString(FormatFasta(seq, formater)) + bs.WriteString("\n") + } else { + if skipEmpty { + log.Warnf("Sequence %s is empty and skiped in output",seq.Id()) + } else { + log.Fatalf("Sequence %s is empty",seq.Id()) + } + } } return bs.Bytes() } @@ -99,7 +107,7 @@ func WriteFasta(iterator obiiter.IBioSequence, batch := iterator.Get() chunkchan <- FileChunck{ - FormatFastaBatch(batch, header_format), + FormatFastaBatch(batch, header_format, opt.SkipEmptySequence()), batch.Order(), } newIter.Push(batch) diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go index 7825482..1d2c695 100644 --- a/pkg/obiformats/fastseq_write_fastq.go +++ b/pkg/obiformats/fastseq_write_fastq.go @@ -39,11 +39,20 @@ func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHead } func FormatFastqBatch(batch obiiter.BioSequenceBatch, quality_shift int, - formater FormatHeader) []byte { + formater FormatHeader, skipEmpty bool) []byte { var bs bytes.Buffer for _, seq := range batch.Slice() { - bs.WriteString(FormatFastq(seq, quality_shift, formater)) - bs.WriteString("\n") + if seq.Len() > 0 { + bs.WriteString(FormatFastq(seq, quality_shift, formater)) + bs.WriteString("\n") + } else { + if skipEmpty { + log.Warnf("Sequence %s is empty and skiped in output", seq.Id()) + } else { + log.Fatalf("Sequence %s is 
empty", seq.Id()) + } + } + } return bs.Bytes() } @@ -90,7 +99,7 @@ func WriteFastq(iterator obiiter.IBioSequence, for iterator.Next() { batch := iterator.Get() chunk := FileChunck{ - FormatFastqBatch(batch, quality, header_format), + FormatFastqBatch(batch, quality, header_format, opt.SkipEmptySequence()), batch.Order(), } chunkchan <- chunk diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go index 43c3f92..6b1d72e 100644 --- a/pkg/obiformats/options.go +++ b/pkg/obiformats/options.go @@ -16,6 +16,7 @@ type __options__ struct { closefile bool appendfile bool compressed bool + skip_empty bool csv_id bool csv_sequence bool csv_quality bool @@ -48,6 +49,7 @@ func MakeOptions(setters []WithOption) Options { closefile: false, appendfile: false, compressed: false, + skip_empty: false, csv_id: true, csv_definition: false, csv_count: false, @@ -110,6 +112,10 @@ func (opt Options) CompressedFile() bool { return opt.pointer.compressed } +func (opt Options) SkipEmptySequence() bool { + return opt.pointer.skip_empty +} + func (opt Options) CSVId() bool { return opt.pointer.csv_id } @@ -194,6 +200,14 @@ func OptionsCompressed(compressed bool) WithOption { return f } +func OptionsSkipEmptySequence(skip bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.skip_empty = skip + }) + + return f +} + func OptionsNewFile() WithOption { f := WithOption(func(opt Options) { opt.pointer.appendfile = false diff --git a/pkg/obitools/obiclean/obiclean.go b/pkg/obitools/obiclean/obiclean.go index 3e589b8..65be6b3 100644 --- a/pkg/obitools/obiclean/obiclean.go +++ b/pkg/obitools/obiclean/obiclean.go @@ -285,7 +285,7 @@ func Weight(sequence *obiseq.BioSequence) map[string]int { return weight } -func IOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence { +func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence { db := itertator.Load() diff --git a/pkg/obitools/obiconvert/options.go b/pkg/obitools/obiconvert/options.go index 
eca2294..13463be 100644 --- a/pkg/obitools/obiconvert/options.go +++ b/pkg/obitools/obiconvert/options.go @@ -26,6 +26,7 @@ var __output_solexa_quality__ = false var __no_progress_bar__ = false var __compressed__ = false +var __skip_empty__ = false var __output_file_name__ = "-" var __paired_file_name__ = "" @@ -70,6 +71,9 @@ func OutputModeOptionSet(options *getoptions.GetOpt) { options.Alias("Z"), options.Description("Output is compressed")) + options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__, + options.Description("Sequences of length equal to zero are suppressed from the output")) + options.StringVar(&__output_file_name__, "out", __output_file_name__, options.Alias("o"), options.ArgName("FILENAME"), @@ -141,6 +145,10 @@ func CLICompressed() bool { return __compressed__ } +func CLISkipEmpty() bool { + return __skip_empty__ +} + func CLIInputFastHeaderFormat() string { switch { case __input_fastjson_format__: diff --git a/pkg/obitools/obiconvert/sequence_writer.go b/pkg/obitools/obiconvert/sequence_writer.go index e91e5a8..0756656 100644 --- a/pkg/obitools/obiconvert/sequence_writer.go +++ b/pkg/obitools/obiconvert/sequence_writer.go @@ -80,6 +80,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, var reverse string fn, reverse = BuildPairedFileNames(fn) opts = append(opts, obiformats.WritePairedReadsTo(reverse)) + } else { + opts = append(opts, obiformats.OptionsSkipEmptySequence(CLISkipEmpty())) } switch CLIOutputFormat() { @@ -91,6 +93,7 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...) } } else { + opts = append(opts, obiformats.OptionsSkipEmptySequence(CLISkipEmpty())) switch CLIOutputFormat() { case "fastq": newIter, err = obiformats.WriteFastqToStdout(iterator, opts...) @@ -99,6 +102,7 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, default: newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...) } + } if err != nil {