mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds a JSON output format
Former-commit-id: 26f07460772c0f735bf705d473f892878d3e57f0
This commit is contained in:
@ -12,17 +12,24 @@
|
||||
- A new file format guesser is now implemented. This is a first step towards allowing new formats to be managed by obitools.
|
||||
- New way of handling header definitions of fasta and fastq formats with JSON headers.
|
||||
The sequence definition is now printed in new files as an attribute of the json header named "definition".
|
||||
- The -D (--delta) option has been added to `obipcr`. It allows to extract flanking sequences of the barcode.
|
||||
- The -D (--delta) option has been added to `obipcr`. It allows extracting flanking sequences of the barcode.
|
||||
+ If -D is not set, the output sequence is the barcode itself without the priming sites.
|
||||
+ If -D is set to 0, the output sequence is the barcode with the priming sites.
|
||||
+ When -D is set to ### (where ### is an integer), the output sequence is the barcode with the priming sites
|
||||
and ### base pairs of flanking sequences.
|
||||
- A new JSON output format is available through the **--json-output** option. The sequence file is printed as a JSON vector,
|
||||
where each element is a map corresponding to a sequence. The map has at most four elements:
|
||||
+ *"id"* : which is the only mandatory element (string)
|
||||
+ *"sequence"* : if sequence data is present in the record (string)
|
||||
+ *"qualities"* : if quality data is associated with the record (string)
|
||||
+ *"annotations"* : if annotations are associated with the record (a map of annotations).
|
||||
|
||||
|
||||
### Bugs
|
||||
|
||||
- in the obitools language, the `composition` function now returns a map indexded by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ascii codes of the corresponding letters.
|
||||
- in the obitools language, the `composition` function now returns a map indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ASCII codes of the corresponding letters.
|
||||
- Correction of the reverse-complement operation. Every reverse complement of a DNA sequence now follows these rules:
|
||||
+ Nucleotides code are complemented to their lower complementary base
|
||||
+ Nucleotide codes are complemented to their lower complementary base
|
||||
+ `.` and `-` characters are returned without change
|
||||
+ `[` is complemented to `]` and vice versa
|
||||
+ all other characters are complemented as `n`
|
||||
|
47
pkg/obiformats/fastqseq_write_generic.go
Normal file
47
pkg/obiformats/fastqseq_write_generic.go
Normal file
@ -0,0 +1,47 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
// BioSequenceBatchFormater is the signature of a function that renders a whole
// batch of sequences as a byte slice.
type BioSequenceBatchFormater func(batch obiiter.BioSequenceBatch) []byte
|
||||
// BioSequenceFormater is the signature of a function that renders a single
// sequence as a string.
type BioSequenceFormater func(sequence *obiseq.BioSequence) string
|
||||
|
||||
func BuildFastxSeqFormater(format string, header FormatHeader) BioSequenceFormater {
|
||||
var f BioSequenceFormater
|
||||
|
||||
switch format {
|
||||
case "fastq":
|
||||
f = func(sequence *obiseq.BioSequence) string {
|
||||
return FormatFastq(sequence, header)
|
||||
}
|
||||
case "fasta":
|
||||
f = func(sequence *obiseq.BioSequence) string {
|
||||
return FormatFasta(sequence, header)
|
||||
}
|
||||
default:
|
||||
log.Fatal("Unknown output format")
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func BuildFastxFormater(format string, header FormatHeader) BioSequenceBatchFormater {
|
||||
fs := BuildFastxSeqFormater(format, header)
|
||||
|
||||
f := func(batch obiiter.BioSequenceBatch) []byte {
|
||||
var bs bytes.Buffer
|
||||
for _, seq := range batch.Slice() {
|
||||
bs.WriteString(fs(seq))
|
||||
bs.WriteString("\n")
|
||||
}
|
||||
return bs.Bytes()
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
185
pkg/obiformats/json_writer.go
Normal file
185
pkg/obiformats/json_writer.go
Normal file
@ -0,0 +1,185 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func JSONRecord(sequence *obiseq.BioSequence) []byte {
|
||||
record := make(map[string]interface{}, 4)
|
||||
|
||||
record["id"] = sequence.Id()
|
||||
|
||||
if sequence.HasSequence() {
|
||||
record["sequence"] = sequence.String()
|
||||
}
|
||||
|
||||
if sequence.HasQualities() {
|
||||
record["qualities"] = sequence.QualitiesString()
|
||||
}
|
||||
|
||||
if sequence.HasAnnotation() {
|
||||
record["annotations"] = sequence.Annotations()
|
||||
}
|
||||
|
||||
text, error := json.MarshalIndent(record, " ", " ")
|
||||
|
||||
if error != nil {
|
||||
log.Panicf("conversion to JSON error on sequence id %s", sequence.Id())
|
||||
}
|
||||
|
||||
return text
|
||||
}
|
||||
|
||||
func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte {
|
||||
buff := new(bytes.Buffer)
|
||||
json := bufio.NewWriter(buff)
|
||||
n := batch.Slice().Len() - 1
|
||||
for i, s := range batch.Slice() {
|
||||
json.WriteString(" ")
|
||||
json.Write(JSONRecord(s))
|
||||
if i < n {
|
||||
json.WriteString(",\n")
|
||||
}
|
||||
}
|
||||
|
||||
json.Flush()
|
||||
|
||||
return buff.Bytes()
|
||||
}
|
||||
|
||||
func WriteJSON(iterator obiiter.IBioSequence,
|
||||
file io.WriteCloser,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
|
||||
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
obiiter.RegisterAPipe()
|
||||
chunkchan := make(chan FileChunck)
|
||||
|
||||
newIter.Add(nwriters)
|
||||
var waitWriter sync.WaitGroup
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
for len(chunkchan) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(chunkchan)
|
||||
waitWriter.Wait()
|
||||
}()
|
||||
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
for iterator.Next() {
|
||||
|
||||
batch := iterator.Get()
|
||||
|
||||
chunkchan <- FileChunck{
|
||||
FormatJSONBatch(batch),
|
||||
batch.Order(),
|
||||
}
|
||||
newIter.Push(batch)
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]FileChunck, 100)
|
||||
|
||||
waitWriter.Add(1)
|
||||
go func() {
|
||||
for chunk := range chunkchan {
|
||||
if chunk.order == next_to_send {
|
||||
if next_to_send > 0 {
|
||||
file.Write([]byte(",\n"))
|
||||
}
|
||||
file.Write(chunk.text)
|
||||
next_to_send++
|
||||
chunk, ok := received[next_to_send]
|
||||
for ok {
|
||||
file.Write(chunk.text)
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
chunk, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[chunk.order] = chunk
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
file.Write([]byte("\n]\n"))
|
||||
file.Close()
|
||||
|
||||
log.Debugln("End of the JSON file writing")
|
||||
obiiter.UnregisterPipe()
|
||||
waitWriter.Done()
|
||||
|
||||
}()
|
||||
|
||||
log.Debugln("Start of the JSON file writing")
|
||||
file.Write([]byte("[\n"))
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
||||
func WriteJSONToStdout(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionDontCloseFile())
|
||||
return WriteJSON(iterator, os.Stdout, options...)
|
||||
}
|
||||
|
||||
func WriteJSONToFile(iterator obiiter.IBioSequence,
|
||||
filename string,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
flags := os.O_WRONLY | os.O_CREATE
|
||||
|
||||
if opt.AppendFile() {
|
||||
flags |= os.O_APPEND
|
||||
}
|
||||
file, err := os.OpenFile(filename, flags, 0660)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
options = append(options, OptionCloseFile())
|
||||
|
||||
iterator, err = WriteJSON(iterator, file, options...)
|
||||
|
||||
if opt.HaveToSavePaired() {
|
||||
var revfile *os.File
|
||||
|
||||
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
iterator, err = WriteJSON(iterator.PairedWith(), revfile, options...)
|
||||
}
|
||||
|
||||
return iterator, err
|
||||
}
|
@ -18,6 +18,7 @@ var __input_genbank_format__ = false
|
||||
|
||||
var __output_in_fasta__ = false
|
||||
var __output_in_fastq__ = false
|
||||
var __output_in_json__ = false
|
||||
var __output_fastjson_format__ = false
|
||||
var __output_fastobi_format__ = false
|
||||
|
||||
@ -77,10 +78,13 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
func OutputOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
|
||||
options.Description("Read data following the ecoPCR output format."))
|
||||
options.Description("Write sequence in fasta format (default if no quality data available)."))
|
||||
|
||||
options.BoolVar(&__output_in_fastq__, "fastq-output", false,
|
||||
options.Description("Read data following the EMBL flatfile format."))
|
||||
options.Description("Write sequence in fastq format (default if quality data available)."))
|
||||
|
||||
options.BoolVar(&__output_in_json__, "json-output", false,
|
||||
options.Description("Write sequence in json format."))
|
||||
|
||||
options.BoolVar(&__output_fastjson_format__, "output-json-header", false,
|
||||
options.Description("output FASTA/FASTQ title line annotations follow json format."))
|
||||
@ -130,6 +134,8 @@ func CLIOutputFormat() string {
|
||||
return "fastq"
|
||||
case __output_in_fasta__:
|
||||
return "fasta"
|
||||
case __output_in_json__:
|
||||
return "json"
|
||||
default:
|
||||
return "guessed"
|
||||
}
|
||||
|
@ -87,6 +87,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
|
||||
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
|
||||
case "fasta":
|
||||
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
|
||||
case "json":
|
||||
newIter, err = obiformats.WriteJSONToFile(iterator, fn, opts...)
|
||||
default:
|
||||
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
|
||||
}
|
||||
@ -97,6 +99,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
|
||||
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
|
||||
case "fasta":
|
||||
newIter, err = obiformats.WriteFastaToStdout(iterator, opts...)
|
||||
case "json":
|
||||
newIter, err = obiformats.WriteJSONToStdout(iterator, opts...)
|
||||
default:
|
||||
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
|
||||
}
|
||||
|
Reference in New Issue
Block a user