From 185b974d132c10da4b8df9e286abf5a5d888f8ff Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 7 Nov 2023 11:56:49 +0200 Subject: [PATCH] Adds a JSON output format Former-commit-id: 26f07460772c0f735bf705d473f892878d3e57f0 --- Release-notes.md | 13 +- pkg/obiformats/fastqseq_write_generic.go | 47 ++++++ pkg/obiformats/json_writer.go | 185 +++++++++++++++++++++ pkg/obitools/obiconvert/options.go | 10 +- pkg/obitools/obiconvert/sequence_writer.go | 4 + 5 files changed, 254 insertions(+), 5 deletions(-) create mode 100644 pkg/obiformats/fastqseq_write_generic.go create mode 100644 pkg/obiformats/json_writer.go diff --git a/Release-notes.md b/Release-notes.md index 6d87887..dc9d918 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -12,17 +12,24 @@ - A new file format guesser is now implemented. This is a first step towards allowing new formats to be managed by obitools. - New way of handling header definitions of fasta and fastq formats with JSON headers. The sequence definition is now printed in new files as an attribute of the json header named "definition". -- The -D (--delta) option has been added to `obipcr`. It allows to extract flanking sequences of the barcode. +- The -D (--delta) option has been added to `obipcr`. It allows extracting flanking sequences of the barcode. + If -D is not set, the output sequence is the barcode itself without the priming sites. + If -D is set to 0, the output sequence is the barcode with the priming sites. + When -D is set to ### (where ### is an integer), the output sequence is the barcode with the priming sites. and ### base pairs of flanking sequences. +- A new output format in JSON is proposed using the **--json-output** option. The sequence file is printed as a JSON vector, + where each element is a map corresponding to a sequence. 
The map has at most four elements: + + *"id"* : which is the only mandatory element (string) + + *"sequence"* : if sequence data is present in the record (string) + + *"qualities"* : if quality data is associated to the record (string) + + *"annotations"* : if annotations are associated to the record (a map of annotations). + ### Bugs -- in the obitools language, the `composition` function now returns a map indexded by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ascii codes of the corresponding letters. +- in the obitools language, the `composition` function now returns a map indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ASCII codes of the corresponding letters. - Correction of the reverse-complement operation. Every reverse complement of the DNA sequence follow now the following rules : - + Nucleotides code are complemented to their lower complementary base + + Nucleotide codes are complemented to their lower complementary base + `.` and `-` characters are returned without change + `[` is complemented to `]` and oppositely + all other characters are complemented as `n` diff --git a/pkg/obiformats/fastqseq_write_generic.go b/pkg/obiformats/fastqseq_write_generic.go new file mode 100644 index 0000000..79f3f29 --- /dev/null +++ b/pkg/obiformats/fastqseq_write_generic.go @@ -0,0 +1,47 @@ +package obiformats + +import ( + "bytes" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +) + +type BioSequenceBatchFormater func(batch obiiter.BioSequenceBatch) []byte +type BioSequenceFormater func(sequence *obiseq.BioSequence) string + +func BuildFastxSeqFormater(format string, header FormatHeader) BioSequenceFormater { + var f BioSequenceFormater + + switch format { + case "fastq": + f = func(sequence *obiseq.BioSequence) string { + return FormatFastq(sequence, header) + } + case 
"fasta": + f = func(sequence *obiseq.BioSequence) string { + return FormatFasta(sequence, header) + } + default: + log.Fatal("Unknown output format") + } + + return f +} + +func BuildFastxFormater(format string, header FormatHeader) BioSequenceBatchFormater { + fs := BuildFastxSeqFormater(format, header) + + f := func(batch obiiter.BioSequenceBatch) []byte { + var bs bytes.Buffer + for _, seq := range batch.Slice() { + bs.WriteString(fs(seq)) + bs.WriteString("\n") + } + return bs.Bytes() + } + + return f +} diff --git a/pkg/obiformats/json_writer.go b/pkg/obiformats/json_writer.go new file mode 100644 index 0000000..0d4193b --- /dev/null +++ b/pkg/obiformats/json_writer.go @@ -0,0 +1,185 @@ +package obiformats + +import ( + "bufio" + "bytes" + "encoding/json" + "io" + "os" + "sync" + "time" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" + log "github.com/sirupsen/logrus" +) + +func JSONRecord(sequence *obiseq.BioSequence) []byte { + record := make(map[string]interface{}, 4) + + record["id"] = sequence.Id() + + if sequence.HasSequence() { + record["sequence"] = sequence.String() + } + + if sequence.HasQualities() { + record["qualities"] = sequence.QualitiesString() + } + + if sequence.HasAnnotation() { + record["annotations"] = sequence.Annotations() + } + + text, error := json.MarshalIndent(record, " ", " ") + + if error != nil { + log.Panicf("conversion to JSON error on sequence id %s", sequence.Id()) + } + + return text +} + +func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte { + buff := new(bytes.Buffer) + json := bufio.NewWriter(buff) + n := batch.Slice().Len() - 1 + for i, s := range batch.Slice() { + json.WriteString(" ") + json.Write(JSONRecord(s)) + if i < n { + json.WriteString(",\n") + } + } + + json.Flush() + + return buff.Bytes() +} + +func WriteJSON(iterator obiiter.IBioSequence, + file io.WriteCloser, + 
options ...WithOption) (obiiter.IBioSequence, error) { + + opt := MakeOptions(options) + + file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) + + newIter := obiiter.MakeIBioSequence() + + nwriters := opt.ParallelWorkers() + + obiiter.RegisterAPipe() + chunkchan := make(chan FileChunck) + + newIter.Add(nwriters) + var waitWriter sync.WaitGroup + + go func() { + newIter.WaitAndClose() + for len(chunkchan) > 0 { + time.Sleep(time.Millisecond) + } + close(chunkchan) + waitWriter.Wait() + }() + + ff := func(iterator obiiter.IBioSequence) { + for iterator.Next() { + + batch := iterator.Get() + + chunkchan <- FileChunck{ + FormatJSONBatch(batch), + batch.Order(), + } + newIter.Push(batch) + } + newIter.Done() + } + + next_to_send := 0 + received := make(map[int]FileChunck, 100) + + waitWriter.Add(1) + go func() { + for chunk := range chunkchan { + if chunk.order == next_to_send { + if next_to_send > 0 { + file.Write([]byte(",\n")) + } + file.Write(chunk.text) + next_to_send++ + chunk, ok := received[next_to_send] + for ok { + file.Write(chunk.text) + delete(received, next_to_send) + next_to_send++ + chunk, ok = received[next_to_send] + } + } else { + received[chunk.order] = chunk + } + + } + + file.Write([]byte("\n]\n")) + file.Close() + + log.Debugln("End of the JSON file writing") + obiiter.UnregisterPipe() + waitWriter.Done() + + }() + + log.Debugln("Start of the JSON file writing") + file.Write([]byte("[\n")) + go ff(iterator) + for i := 0; i < nwriters-1; i++ { + go ff(iterator.Split()) + } + + return newIter, nil +} + +func WriteJSONToStdout(iterator obiiter.IBioSequence, + options ...WithOption) (obiiter.IBioSequence, error) { + options = append(options, OptionDontCloseFile()) + return WriteJSON(iterator, os.Stdout, options...) 
+} + +func WriteJSONToFile(iterator obiiter.IBioSequence, + filename string, + options ...WithOption) (obiiter.IBioSequence, error) { + + opt := MakeOptions(options) + flags := os.O_WRONLY | os.O_CREATE + + if opt.AppendFile() { + flags |= os.O_APPEND + } + file, err := os.OpenFile(filename, flags, 0660) + + if err != nil { + log.Fatalf("open file error: %v", err) + return obiiter.NilIBioSequence, err + } + + options = append(options, OptionCloseFile()) + + iterator, err = WriteJSON(iterator, file, options...) + + if opt.HaveToSavePaired() { + var revfile *os.File + + revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660) + if err != nil { + log.Fatalf("open file error: %v", err) + return obiiter.NilIBioSequence, err + } + iterator, err = WriteJSON(iterator.PairedWith(), revfile, options...) + } + + return iterator, err +} diff --git a/pkg/obitools/obiconvert/options.go b/pkg/obitools/obiconvert/options.go index 91e5065..255e208 100644 --- a/pkg/obitools/obiconvert/options.go +++ b/pkg/obitools/obiconvert/options.go @@ -18,6 +18,7 @@ var __input_genbank_format__ = false var __output_in_fasta__ = false var __output_in_fastq__ = false +var __output_in_json__ = false var __output_fastjson_format__ = false var __output_fastobi_format__ = false @@ -77,10 +78,13 @@ func OutputModeOptionSet(options *getoptions.GetOpt) { func OutputOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__output_in_fasta__, "fasta-output", false, - options.Description("Read data following the ecoPCR output format.")) + options.Description("Write sequence in fasta format (default if no quality data available).")) options.BoolVar(&__output_in_fastq__, "fastq-output", false, - options.Description("Read data following the EMBL flatfile format.")) + options.Description("Write sequence in fastq format (default if quality data available).")) + + options.BoolVar(&__output_in_json__, "json-output", false, + options.Description("Write sequence in json format.")) 
options.BoolVar(&__output_fastjson_format__, "output-json-header", false, options.Description("output FASTA/FASTQ title line annotations follow json format.")) @@ -130,6 +134,8 @@ func CLIOutputFormat() string { return "fastq" case __output_in_fasta__: return "fasta" + case __output_in_json__: + return "json" default: return "guessed" } diff --git a/pkg/obitools/obiconvert/sequence_writer.go b/pkg/obitools/obiconvert/sequence_writer.go index 1e1e7ef..5cdaa5b 100644 --- a/pkg/obitools/obiconvert/sequence_writer.go +++ b/pkg/obitools/obiconvert/sequence_writer.go @@ -87,6 +87,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...) case "fasta": newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...) + case "json": + newIter, err = obiformats.WriteJSONToFile(iterator, fn, opts...) default: newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...) } @@ -97,6 +99,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, newIter, err = obiformats.WriteFastqToStdout(iterator, opts...) case "fasta": newIter, err = obiformats.WriteFastaToStdout(iterator, opts...) + case "json": + newIter, err = obiformats.WriteJSONToStdout(iterator, opts...) default: newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...) }