Adds a JSON output format

Former-commit-id: 26f07460772c0f735bf705d473f892878d3e57f0
This commit is contained in:
2023-11-07 11:56:49 +02:00
parent 61c30f9b6a
commit 185b974d13
5 changed files with 254 additions and 5 deletions

View File

@ -12,17 +12,24 @@
- A new file format guesser is now implemented. This is a first step towards allowing new formats to be managed by obitools.
- New way of handling header definitions of fasta and fastq formats with JSON headers.
The sequence definition is now printed in new files as an attribute of the json header named "definition".
- The -D (--delta) option has been added to `obipcr`. It allows to extract flanking sequences of the barcode.
- The -D (--delta) option has been added to `obipcr`. It allows extracting flanking sequences of the barcode.
+ If -D is not set, the output sequence is the barcode itself without the priming sites.
+ If -D is set to 0, the output sequence is the barcode with the priming sites.
+ When -D is set to ### (where ### is an integer), the output sequence is the barcode with the priming sites
and ### base pairs of flanking sequences.
- A new output format in JSON is available through the **--json-output** option. The sequence file is printed as a JSON vector,
where each element is a map corresponding to a sequence. The map has at most four elements:
+ *"id"* : which is the only mandatory element (string)
+ *"sequence"* : if sequence data is present in the record (string)
+ *"qualities"* : if quality data is associated to the record (string)
+ *"annotations"* : if annotations are associated to the record (a map of annotations).
### Bugs
- in the obitools language, the `composition` function now returns a map indexded by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ascii codes of the corresponding letters.
- in the obitools language, the `composition` function now returns a map indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ASCII codes of the corresponding letters.
- Correction of the reverse-complement operation. Every reverse complement of a DNA sequence now follows these rules:
+ Nucleotides code are complemented to their lower complementary base
+ Nucleotide codes are complemented to their lower complementary base
+ `.` and `-` characters are returned without change
+ `[` is complemented to `]` and vice versa
+ all other characters are complemented as `n`

View File

@ -0,0 +1,47 @@
package obiformats
import (
"bytes"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
// BioSequenceBatchFormater is the signature of the functions that serialize a
// whole batch of sequences into a single byte slice.
type BioSequenceBatchFormater func(batch obiiter.BioSequenceBatch) []byte
// BioSequenceFormater is the signature of the functions that render one
// sequence as a string.
type BioSequenceFormater func(sequence *obiseq.BioSequence) string
// BuildFastxSeqFormater returns the function that formats a single sequence
// in the requested format.
//
// format must be "fastq" or "fasta"; the returned function forwards each
// sequence, together with header, to FormatFastq or FormatFasta respectively.
// Any other format name is reported through log.Fatal.
func BuildFastxSeqFormater(format string, header FormatHeader) BioSequenceFormater {
	switch format {
	case "fastq":
		return func(sequence *obiseq.BioSequence) string {
			return FormatFastq(sequence, header)
		}
	case "fasta":
		return func(sequence *obiseq.BioSequence) string {
			return FormatFasta(sequence, header)
		}
	}

	// Neither of the supported formats matched.
	log.Fatal("Unknown output format")
	return nil
}
// BuildFastxFormater returns the function that formats a whole batch of
// sequences in the requested format.
//
// The per-sequence formatter is obtained once from BuildFastxSeqFormater; the
// returned function applies it to every sequence of the batch and terminates
// each formatted sequence with a newline.
func BuildFastxFormater(format string, header FormatHeader) BioSequenceBatchFormater {
	formatOne := BuildFastxSeqFormater(format, header)

	return func(batch obiiter.BioSequenceBatch) []byte {
		var out bytes.Buffer

		for _, sequence := range batch.Slice() {
			out.WriteString(formatOne(sequence))
			out.WriteString("\n")
		}

		return out.Bytes()
	}
}

View File

@ -0,0 +1,185 @@
package obiformats
import (
"bufio"
"bytes"
"encoding/json"
"io"
"os"
"sync"
"time"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// JSONRecord serializes a sequence as an indented JSON object.
//
// The object always holds the "id" key. The "sequence", "qualities" and
// "annotations" keys are only added when the sequence reports having the
// corresponding data.
//
// If the record cannot be marshalled, the failure is reported through
// log.Panicf together with the sequence id and the underlying error.
func JSONRecord(sequence *obiseq.BioSequence) []byte {
	record := make(map[string]interface{}, 4)

	record["id"] = sequence.Id()

	if sequence.HasSequence() {
		record["sequence"] = sequence.String()
	}

	if sequence.HasQualities() {
		record["qualities"] = sequence.QualitiesString()
	}

	if sequence.HasAnnotation() {
		record["annotations"] = sequence.Annotations()
	}

	// The result is named err so that it does not shadow the builtin error
	// type, and it is included in the message so the cause is not lost.
	text, err := json.MarshalIndent(record, " ", " ")
	if err != nil {
		log.Panicf("conversion to JSON error on sequence id %s: %v", sequence.Id(), err)
	}

	return text
}
// FormatJSONBatch serializes every sequence of a batch with JSONRecord and
// returns the concatenated records.
//
// Each record is preceded by a single space and consecutive records are
// separated by ",\n". No separator follows the last record and no enclosing
// brackets are emitted.
func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte {
	var buffer bytes.Buffer
	writer := bufio.NewWriter(&buffer)

	sequences := batch.Slice()
	last := sequences.Len() - 1

	for i, sequence := range sequences {
		writer.WriteString(" ")
		writer.Write(JSONRecord(sequence))

		// A separator is only needed between two records.
		if i < last {
			writer.WriteString(",\n")
		}
	}

	writer.Flush()

	return buffer.Bytes()
}
// WriteJSON writes the sequences read from iterator to file as one JSON
// array and returns a new iterator on which every written batch is pushed.
//
// The output stream is first wrapped by obiutils.CompressStream according to
// the CompressedFile and CloseFile options. Batches are then formatted with
// FormatJSONBatch by opt.ParallelWorkers() goroutines, and a single writer
// goroutine emits the formatted chunks in increasing batch.Order(), buffering
// the chunks that arrive ahead of their turn.
//
// Parameters:
//   - iterator: the source of sequence batches.
//   - file: the destination stream.
//   - options: the writer options passed to MakeOptions.
//
// It returns the new iterator and a nil error, or obiiter.NilIBioSequence and
// the error returned by obiutils.CompressStream. Write and close failures
// occurring later, in the writer goroutine, are logged.
func WriteJSON(iterator obiiter.IBioSequence,
	file io.WriteCloser,
	options ...WithOption) (obiiter.IBioSequence, error) {
	opt := MakeOptions(options)

	stream, err := obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
	if err != nil {
		return obiiter.NilIBioSequence, err
	}
	file = stream

	// write sends data to the output and logs a failure instead of silently
	// dropping it.
	write := func(data []byte) {
		if _, err := file.Write(data); err != nil {
			log.Errorf("JSON writing error: %v", err)
		}
	}

	newIter := obiiter.MakeIBioSequence()

	nwriters := opt.ParallelWorkers()

	obiiter.RegisterAPipe()
	chunkchan := make(chan FileChunck)

	newIter.Add(nwriters)
	var waitWriter sync.WaitGroup

	go func() {
		// Every formatting goroutine calls newIter.Done() after its last
		// send, so the channel can be closed once they are all done.
		newIter.WaitAndClose()
		// chunkchan is unbuffered, hence len(chunkchan) is always 0 and this
		// loop never spins; it only matters if the channel becomes buffered.
		for len(chunkchan) > 0 {
			time.Sleep(time.Millisecond)
		}
		close(chunkchan)
		waitWriter.Wait()
	}()

	// ff formats the batches of one iterator and forwards them downstream.
	ff := func(iterator obiiter.IBioSequence) {
		for iterator.Next() {
			batch := iterator.Get()

			chunkchan <- FileChunck{
				FormatJSONBatch(batch),
				batch.Order(),
			}
			newIter.Push(batch)
		}
		newIter.Done()
	}

	waitWriter.Add(1)
	go func() {
		nextToSend := 0
		wroteRecords := false
		pending := make(map[int]FileChunck, 100)

		// writeChunk emits one formatted batch. The ",\n" separator is
		// written before every non-empty chunk except the first, whether the
		// chunk comes straight from the channel or from the pending buffer.
		// Empty chunks are skipped so that they never leave a dangling comma.
		writeChunk := func(text []byte) {
			if len(text) == 0 {
				return
			}
			if wroteRecords {
				write([]byte(",\n"))
			}
			write(text)
			wroteRecords = true
		}

		for chunk := range chunkchan {
			if chunk.order != nextToSend {
				// Arrived ahead of its turn: keep it until its predecessors
				// have been written.
				pending[chunk.order] = chunk
				continue
			}

			writeChunk(chunk.text)
			nextToSend++

			// Flush the buffered chunks that are now in sequence.
			for buffered, ok := pending[nextToSend]; ok; buffered, ok = pending[nextToSend] {
				writeChunk(buffered.text)
				delete(pending, nextToSend)
				nextToSend++
			}
		}

		write([]byte("\n]\n"))
		if err := file.Close(); err != nil {
			log.Errorf("JSON file closing error: %v", err)
		}

		log.Debugln("End of the JSON file writing")
		obiiter.UnregisterPipe()
		waitWriter.Done()
	}()

	log.Debugln("Start of the JSON file writing")
	// The opening bracket is written before any formatting goroutine starts,
	// hence before the first chunk can reach the writer goroutine.
	write([]byte("[\n"))

	go ff(iterator)
	for i := 0; i < nwriters-1; i++ {
		go ff(iterator.Split())
	}

	return newIter, nil
}
// WriteJSONToStdout writes the sequences of iterator to the standard output
// in JSON format.
//
// It delegates to WriteJSON with os.Stdout as destination, after appending
// OptionDontCloseFile() to the caller's options, and returns WriteJSON's
// results unchanged.
func WriteJSONToStdout(iterator obiiter.IBioSequence,
	options ...WithOption) (obiiter.IBioSequence, error) {
	stdoutOptions := append(options, OptionDontCloseFile())

	return WriteJSON(iterator, os.Stdout, stdoutOptions...)
}
// WriteJSONToFile writes the sequences of iterator to the file named
// filename in JSON format.
//
// The file is created when missing. When the AppendFile option is set the
// output is appended to the existing content; otherwise the file is truncated
// first, so that no bytes of a longer previous file survive after the new
// JSON document. OptionCloseFile() is appended to the options before
// delegating to WriteJSON.
//
// When the HaveToSavePaired option is set, the paired iterator is written in
// the same way to opt.PairedFileName().
//
// A file that cannot be opened is reported through log.Fatalf. The function
// returns the iterator produced by the last WriteJSON call together with its
// error.
func WriteJSONToFile(iterator obiiter.IBioSequence,
	filename string,
	options ...WithOption) (obiiter.IBioSequence, error) {

	opt := MakeOptions(options)
	flags := os.O_WRONLY | os.O_CREATE

	if opt.AppendFile() {
		flags |= os.O_APPEND
	} else {
		// Without O_TRUNC, overwriting a longer pre-existing file would keep
		// its old tail after the closing bracket of the new document.
		flags |= os.O_TRUNC
	}

	file, err := os.OpenFile(filename, flags, 0660)
	if err != nil {
		log.Fatalf("open file error: %v", err)
		return obiiter.NilIBioSequence, err
	}

	options = append(options, OptionCloseFile())

	iterator, err = WriteJSON(iterator, file, options...)
	if err != nil {
		// Do not go on with the paired file on top of a failed writer.
		return iterator, err
	}

	if opt.HaveToSavePaired() {
		var revfile *os.File

		revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
		if err != nil {
			log.Fatalf("open file error: %v", err)
			return obiiter.NilIBioSequence, err
		}
		iterator, err = WriteJSON(iterator.PairedWith(), revfile, options...)
	}

	return iterator, err
}

View File

@ -18,6 +18,7 @@ var __input_genbank_format__ = false
var __output_in_fasta__ = false
var __output_in_fastq__ = false
var __output_in_json__ = false
var __output_fastjson_format__ = false
var __output_fastobi_format__ = false
@ -77,10 +78,13 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
func OutputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
options.Description("Read data following the ecoPCR output format."))
options.Description("Write sequence in fasta format (default if no quality data available)."))
options.BoolVar(&__output_in_fastq__, "fastq-output", false,
options.Description("Read data following the EMBL flatfile format."))
options.Description("Write sequence in fastq format (default if quality data available)."))
options.BoolVar(&__output_in_json__, "json-output", false,
options.Description("Write sequence in json format."))
options.BoolVar(&__output_fastjson_format__, "output-json-header", false,
options.Description("output FASTA/FASTQ title line annotations follow json format."))
@ -130,6 +134,8 @@ func CLIOutputFormat() string {
return "fastq"
case __output_in_fasta__:
return "fasta"
case __output_in_json__:
return "json"
default:
return "guessed"
}

View File

@ -87,6 +87,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
case "json":
newIter, err = obiformats.WriteJSONToFile(iterator, fn, opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
}
@ -97,6 +99,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToStdout(iterator, opts...)
case "json":
newIter, err = obiformats.WriteJSONToStdout(iterator, opts...)
default:
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
}