Adds a JSON output format

Former-commit-id: 26f07460772c0f735bf705d473f892878d3e57f0
This commit is contained in:
2023-11-07 11:56:49 +02:00
parent 61c30f9b6a
commit 185b974d13
5 changed files with 254 additions and 5 deletions

View File

@ -0,0 +1,47 @@
package obiformats
import (
"bytes"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
// BioSequenceBatchFormater is the signature of a function that renders a
// whole batch of sequences as a single byte slice.
type BioSequenceBatchFormater func(batch obiiter.BioSequenceBatch) []byte
// BioSequenceFormater is the signature of a function that renders one
// sequence as a string.
type BioSequenceFormater func(sequence *obiseq.BioSequence) string
// BuildFastxSeqFormater returns a function rendering a single sequence in the
// requested format.
//
// format must be "fastq" or "fasta"; the returned function delegates to
// FormatFastq or FormatFasta respectively, forwarding header to it. Any other
// format name is reported through log.Fatal, and a nil formater is returned
// should that call ever come back.
func BuildFastxSeqFormater(format string, header FormatHeader) BioSequenceFormater {
	if format == "fastq" {
		return func(sequence *obiseq.BioSequence) string {
			return FormatFastq(sequence, header)
		}
	}

	if format == "fasta" {
		return func(sequence *obiseq.BioSequence) string {
			return FormatFasta(sequence, header)
		}
	}

	log.Fatal("Unknown output format")

	return nil
}
// BuildFastxFormater returns a function rendering a whole batch of sequences.
//
// Each sequence of the batch is rendered with the per-sequence formater built
// by BuildFastxSeqFormater(format, header) and followed by a newline; the
// concatenation is returned as bytes.
func BuildFastxFormater(format string, header FormatHeader) BioSequenceBatchFormater {
	formatSeq := BuildFastxSeqFormater(format, header)

	return func(batch obiiter.BioSequenceBatch) []byte {
		out := new(bytes.Buffer)

		for _, sequence := range batch.Slice() {
			out.WriteString(formatSeq(sequence))
			out.WriteString("\n")
		}

		return out.Bytes()
	}
}

View File

@ -0,0 +1,185 @@
package obiformats
import (
"bufio"
"bytes"
"encoding/json"
"io"
"os"
"sync"
"time"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// JSONRecord encodes a sequence as an indented JSON object.
//
// The object always carries the "id" key. The "sequence", "qualities" and
// "annotations" keys are added only when HasSequence, HasQualities and
// HasAnnotation respectively report true.
//
// A marshalling failure is reported through log.Panicf together with the
// underlying error, so the cause is not lost.
func JSONRecord(sequence *obiseq.BioSequence) []byte {
	record := make(map[string]interface{}, 4)

	record["id"] = sequence.Id()

	if sequence.HasSequence() {
		record["sequence"] = sequence.String()
	}

	if sequence.HasQualities() {
		record["qualities"] = sequence.QualitiesString()
	}

	if sequence.HasAnnotation() {
		record["annotations"] = sequence.Annotations()
	}

	// The result is named err, not error, so the builtin error type is not
	// shadowed inside this function.
	text, err := json.MarshalIndent(record, " ", " ")
	if err != nil {
		log.Panicf("conversion to JSON error on sequence id %s: %v", sequence.Id(), err)
	}

	return text
}
// FormatJSONBatch renders every sequence of batch with JSONRecord and joins
// the records with ",\n".
//
// Each record is preceded by a single space. No separator follows the last
// record and no enclosing brackets are written, so the result is a fragment
// meant to be embedded in a JSON array by the caller.
func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte {
	var buffer bytes.Buffer
	writer := bufio.NewWriter(&buffer)

	sequences := batch.Slice()
	last := sequences.Len() - 1

	for i, sequence := range sequences {
		writer.WriteString(" ")
		writer.Write(JSONRecord(sequence))

		// Separate records, but leave the final one unterminated.
		if i != last {
			writer.WriteString(",\n")
		}
	}

	writer.Flush()

	return buffer.Bytes()
}
// WriteJSON writes the sequences read from iterator to file as one JSON array
// and returns a new iterator onto which every consumed batch is pushed.
//
// Batches are formatted with FormatJSONBatch by opt.ParallelWorkers()
// goroutines. A single goroutine writes the formatted chunks in increasing
// batch.Order(), starting at 0; a chunk arriving ahead of its turn is parked
// in a map until all its predecessors have been written. The array is opened
// with "[\n", chunks are separated by ",\n", and the output is terminated by
// "\n]\n" before file is closed.
//
// The returned error is always nil.
func WriteJSON(iterator obiiter.IBioSequence,
	file io.WriteCloser,
	options ...WithOption) (obiiter.IBioSequence, error) {
	opt := MakeOptions(options)

	// NOTE(review): the second value returned by CompressStream is discarded;
	// presumably it is an error that should be propagated — confirm against
	// the obiutils signature.
	file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())

	newIter := obiiter.MakeIBioSequence()

	nwriters := opt.ParallelWorkers()

	obiiter.RegisterAPipe()
	chunkchan := make(chan FileChunck)

	newIter.Add(nwriters)
	var waitWriter sync.WaitGroup

	go func() {
		newIter.WaitAndClose()
		// chunkchan is unbuffered, so len(chunkchan) is always 0 and this
		// loop never sleeps; it is kept unchanged.
		for len(chunkchan) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(chunkchan)
		waitWriter.Wait()
	}()

	// ff formats every batch of its iterator, hands the text to the writer
	// goroutine and forwards the batch to newIter.
	ff := func(iterator obiiter.IBioSequence) {
		for iterator.Next() {
			batch := iterator.Get()

			chunkchan <- FileChunck{
				FormatJSONBatch(batch),
				batch.Order(),
			}
			newIter.Push(batch)
		}
		newIter.Done()
	}

	nextToSend := 0
	received := make(map[int]FileChunck, 100)

	// wrote records whether at least one non-empty chunk is already in the
	// output, i.e. whether the next one must be preceded by a separator.
	wrote := false

	// emit appends one formatted chunk to the array. Empty chunks (produced
	// by empty batches) are skipped so that they do not leave a dangling
	// ",\n" behind, which would make the JSON invalid.
	emit := func(text []byte) {
		if len(text) == 0 {
			return
		}
		if wrote {
			file.Write([]byte(",\n"))
		}
		file.Write(text)
		wrote = true
	}

	waitWriter.Add(1)
	go func() {
		for chunk := range chunkchan {
			if chunk.order != nextToSend {
				// Too early: park the chunk until its predecessors are out.
				received[chunk.order] = chunk
				continue
			}

			emit(chunk.text)
			nextToSend++

			// Flush every parked chunk that is now in sequence. They go
			// through emit as well, so they get the same ",\n" separator as
			// chunks written on arrival.
			for parked, ok := received[nextToSend]; ok; parked, ok = received[nextToSend] {
				emit(parked.text)
				delete(received, nextToSend)
				nextToSend++
			}
		}

		file.Write([]byte("\n]\n"))
		file.Close()

		log.Debugln("End of the JSON file writing")
		obiiter.UnregisterPipe()
		waitWriter.Done()
	}()

	log.Debugln("Start of the JSON file writing")
	// The opening bracket is written before any formatting goroutine is
	// started, hence before any chunk can reach the writer goroutine.
	file.Write([]byte("[\n"))
	go ff(iterator)
	for i := 0; i < nwriters-1; i++ {
		go ff(iterator.Split())
	}

	return newIter, nil
}
// WriteJSONToStdout writes the sequences of iterator to os.Stdout through
// WriteJSON, with OptionDontCloseFile() appended to the caller's options.
// It returns whatever WriteJSON returns.
func WriteJSONToStdout(iterator obiiter.IBioSequence,
	options ...WithOption) (obiiter.IBioSequence, error) {
	stdoutOptions := append(options, OptionDontCloseFile())

	return WriteJSON(iterator, os.Stdout, stdoutOptions...)
}
// WriteJSONToFile writes the sequences of iterator to the file named filename
// through WriteJSON, with OptionCloseFile() appended to the caller's options.
//
// The file is created when missing. When opt.AppendFile() is true the output
// is appended to it; otherwise the file is truncated first, so that no bytes
// of a previous, longer content survive after the new JSON document.
//
// When opt.HaveToSavePaired() is true, iterator.PairedWith() of the iterator
// returned by the first write is written the same way to
// opt.PairedFileName().
//
// A failure to open either file is reported through log.Fatalf. The function
// returns the iterator produced by the last WriteJSON call and its error.
func WriteJSONToFile(iterator obiiter.IBioSequence,
	filename string,
	options ...WithOption) (obiiter.IBioSequence, error) {

	opt := MakeOptions(options)

	flags := os.O_WRONLY | os.O_CREATE
	if opt.AppendFile() {
		flags |= os.O_APPEND
	} else {
		// Without O_TRUNC, overwriting a longer existing file would leave its
		// old tail after the closing bracket and corrupt the document.
		flags |= os.O_TRUNC
	}

	file, err := os.OpenFile(filename, flags, 0660)
	if err != nil {
		log.Fatalf("open file error: %v", err)
		return obiiter.NilIBioSequence, err
	}

	options = append(options, OptionCloseFile())

	iterator, err = WriteJSON(iterator, file, options...)
	if err != nil {
		// Do not let the paired write below silently overwrite this error.
		return obiiter.NilIBioSequence, err
	}

	if opt.HaveToSavePaired() {
		var revfile *os.File

		revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
		if err != nil {
			log.Fatalf("open file error: %v", err)
			return obiiter.NilIBioSequence, err
		}

		iterator, err = WriteJSON(iterator.PairedWith(), revfile, options...)
	}

	return iterator, err
}

View File

@ -18,6 +18,7 @@ var __input_genbank_format__ = false
var __output_in_fasta__ = false
var __output_in_fastq__ = false
var __output_in_json__ = false
var __output_fastjson_format__ = false
var __output_fastobi_format__ = false
@ -77,10 +78,13 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
func OutputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
options.Description("Read data following the ecoPCR output format."))
options.Description("Write sequence in fasta format (default if no quality data available)."))
options.BoolVar(&__output_in_fastq__, "fastq-output", false,
options.Description("Read data following the EMBL flatfile format."))
options.Description("Write sequence in fastq format (default if quality data available)."))
options.BoolVar(&__output_in_json__, "json-output", false,
options.Description("Write sequence in json format."))
options.BoolVar(&__output_fastjson_format__, "output-json-header", false,
options.Description("output FASTA/FASTQ title line annotations follow json format."))
@ -130,6 +134,8 @@ func CLIOutputFormat() string {
return "fastq"
case __output_in_fasta__:
return "fasta"
case __output_in_json__:
return "json"
default:
return "guessed"
}

View File

@ -87,6 +87,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
case "json":
newIter, err = obiformats.WriteJSONToFile(iterator, fn, opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
}
@ -97,6 +99,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToStdout(iterator, opts...)
case "json":
newIter, err = obiformats.WriteJSONToStdout(iterator, opts...)
default:
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
}