mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds a JSON output format
Former-commit-id: 26f07460772c0f735bf705d473f892878d3e57f0
This commit is contained in:
@ -12,17 +12,24 @@
|
|||||||
- A new file format guesser is now implemented. This is a first step towards allowing new formats to be managed by obitools.
|
- A new file format guesser is now implemented. This is a first step towards allowing new formats to be managed by obitools.
|
||||||
- New way of handling header definitions of fasta and fastq formats with JSON headers.
|
- New way of handling header definitions of fasta and fastq formats with JSON headers.
|
||||||
The sequence definition is now printed in new files as an attribute of the json header named "definition".
|
The sequence definition is now printed in new files as an attribute of the json header named "definition".
|
||||||
- The -D (--delta) option has been added to `obipcr`. It allows to extract flanking sequences of the barcode.
|
- The -D (--delta) option has been added to `obipcr`. It allows extracting flanking sequences of the barcode.
|
||||||
+ If -D is not set, the output sequence is the barcode itself without the priming sites.
|
+ If -D is not set, the output sequence is the barcode itself without the priming sites.
|
||||||
+ If -D is set to 0, the output sequence is the barcode with the priming sites.
|
+ If -D is set to 0, the output sequence is the barcode with the priming sites.
|
||||||
+ When -D is set to ### (where ### is an integer), the output sequence is the barcode with the priming sites.
|
+ When -D is set to ### (where ### is an integer), the output sequence is the barcode with the priming sites.
|
||||||
and ### base pairs of flanking sequences.
|
and ### base pairs of flanking sequences.
|
||||||
|
- A new output format in JSON is proposed using the **--json-output** option. The sequence file is printed as a JSON array,
|
||||||
|
where each element is a map corresponding to a sequence. The map has at most four elements:
|
||||||
|
+ *"id"* : which is the only mandatory element (string)
|
||||||
|
+ *"sequence"* : if sequence data is present in the record (string)
|
||||||
|
+ *"qualities"* : if quality data is associated with the record (string)
|
||||||
|
+ *"annotations"* : if annotations are associated with the record (a map of annotations).
|
||||||
|
|
||||||
|
|
||||||
### Bugs
|
### Bugs
|
||||||
|
|
||||||
- in the obitools language, the `composition` function now returns a map indexded by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ascii codes of the corresponding letters.
|
- in the obitools language, the `composition` function now returns a map indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ASCII codes of the corresponding letters.
|
||||||
- Correction of the reverse-complement operation. Every reverse complement of the DNA sequence follow now the following rules :
|
- Correction of the reverse-complement operation. Every reverse complement of the DNA sequence follow now the following rules :
|
||||||
+ Nucleotides code are complemented to their lower complementary base
|
+ Nucleotide codes are complemented to their lower complementary base
|
||||||
+ `.` and `-` characters are returned without change
|
+ `.` and `-` characters are returned without change
|
||||||
+ `[` is complemented to `]` and oppositely
|
+ `[` is complemented to `]` and oppositely
|
||||||
+ all other characters are complemented as `n`
|
+ all other characters are complemented as `n`
|
||||||
|
47
pkg/obiformats/fastqseq_write_generic.go
Normal file
47
pkg/obiformats/fastqseq_write_generic.go
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
package obiformats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BioSequenceBatchFormater is a function turning a whole batch of sequences
// into its serialized representation as a byte slice.
type BioSequenceBatchFormater func(batch obiiter.BioSequenceBatch) []byte
|
||||||
|
// BioSequenceFormater is a function turning a single sequence into its
// textual representation.
type BioSequenceFormater func(sequence *obiseq.BioSequence) string
|
||||||
|
|
||||||
|
func BuildFastxSeqFormater(format string, header FormatHeader) BioSequenceFormater {
|
||||||
|
var f BioSequenceFormater
|
||||||
|
|
||||||
|
switch format {
|
||||||
|
case "fastq":
|
||||||
|
f = func(sequence *obiseq.BioSequence) string {
|
||||||
|
return FormatFastq(sequence, header)
|
||||||
|
}
|
||||||
|
case "fasta":
|
||||||
|
f = func(sequence *obiseq.BioSequence) string {
|
||||||
|
return FormatFasta(sequence, header)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
log.Fatal("Unknown output format")
|
||||||
|
}
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
|
func BuildFastxFormater(format string, header FormatHeader) BioSequenceBatchFormater {
|
||||||
|
fs := BuildFastxSeqFormater(format, header)
|
||||||
|
|
||||||
|
f := func(batch obiiter.BioSequenceBatch) []byte {
|
||||||
|
var bs bytes.Buffer
|
||||||
|
for _, seq := range batch.Slice() {
|
||||||
|
bs.WriteString(fs(seq))
|
||||||
|
bs.WriteString("\n")
|
||||||
|
}
|
||||||
|
return bs.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
185
pkg/obiformats/json_writer.go
Normal file
185
pkg/obiformats/json_writer.go
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
package obiformats
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func JSONRecord(sequence *obiseq.BioSequence) []byte {
|
||||||
|
record := make(map[string]interface{}, 4)
|
||||||
|
|
||||||
|
record["id"] = sequence.Id()
|
||||||
|
|
||||||
|
if sequence.HasSequence() {
|
||||||
|
record["sequence"] = sequence.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
if sequence.HasQualities() {
|
||||||
|
record["qualities"] = sequence.QualitiesString()
|
||||||
|
}
|
||||||
|
|
||||||
|
if sequence.HasAnnotation() {
|
||||||
|
record["annotations"] = sequence.Annotations()
|
||||||
|
}
|
||||||
|
|
||||||
|
text, error := json.MarshalIndent(record, " ", " ")
|
||||||
|
|
||||||
|
if error != nil {
|
||||||
|
log.Panicf("conversion to JSON error on sequence id %s", sequence.Id())
|
||||||
|
}
|
||||||
|
|
||||||
|
return text
|
||||||
|
}
|
||||||
|
|
||||||
|
func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte {
|
||||||
|
buff := new(bytes.Buffer)
|
||||||
|
json := bufio.NewWriter(buff)
|
||||||
|
n := batch.Slice().Len() - 1
|
||||||
|
for i, s := range batch.Slice() {
|
||||||
|
json.WriteString(" ")
|
||||||
|
json.Write(JSONRecord(s))
|
||||||
|
if i < n {
|
||||||
|
json.WriteString(",\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
json.Flush()
|
||||||
|
|
||||||
|
return buff.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
func WriteJSON(iterator obiiter.IBioSequence,
|
||||||
|
file io.WriteCloser,
|
||||||
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
|
|
||||||
|
opt := MakeOptions(options)
|
||||||
|
|
||||||
|
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||||
|
|
||||||
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|
||||||
|
nwriters := opt.ParallelWorkers()
|
||||||
|
|
||||||
|
obiiter.RegisterAPipe()
|
||||||
|
chunkchan := make(chan FileChunck)
|
||||||
|
|
||||||
|
newIter.Add(nwriters)
|
||||||
|
var waitWriter sync.WaitGroup
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
newIter.WaitAndClose()
|
||||||
|
for len(chunkchan) > 0 {
|
||||||
|
time.Sleep(time.Millisecond)
|
||||||
|
}
|
||||||
|
close(chunkchan)
|
||||||
|
waitWriter.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
ff := func(iterator obiiter.IBioSequence) {
|
||||||
|
for iterator.Next() {
|
||||||
|
|
||||||
|
batch := iterator.Get()
|
||||||
|
|
||||||
|
chunkchan <- FileChunck{
|
||||||
|
FormatJSONBatch(batch),
|
||||||
|
batch.Order(),
|
||||||
|
}
|
||||||
|
newIter.Push(batch)
|
||||||
|
}
|
||||||
|
newIter.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
next_to_send := 0
|
||||||
|
received := make(map[int]FileChunck, 100)
|
||||||
|
|
||||||
|
waitWriter.Add(1)
|
||||||
|
go func() {
|
||||||
|
for chunk := range chunkchan {
|
||||||
|
if chunk.order == next_to_send {
|
||||||
|
if next_to_send > 0 {
|
||||||
|
file.Write([]byte(",\n"))
|
||||||
|
}
|
||||||
|
file.Write(chunk.text)
|
||||||
|
next_to_send++
|
||||||
|
chunk, ok := received[next_to_send]
|
||||||
|
for ok {
|
||||||
|
file.Write(chunk.text)
|
||||||
|
delete(received, next_to_send)
|
||||||
|
next_to_send++
|
||||||
|
chunk, ok = received[next_to_send]
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
received[chunk.order] = chunk
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
file.Write([]byte("\n]\n"))
|
||||||
|
file.Close()
|
||||||
|
|
||||||
|
log.Debugln("End of the JSON file writing")
|
||||||
|
obiiter.UnregisterPipe()
|
||||||
|
waitWriter.Done()
|
||||||
|
|
||||||
|
}()
|
||||||
|
|
||||||
|
log.Debugln("Start of the JSON file writing")
|
||||||
|
file.Write([]byte("[\n"))
|
||||||
|
go ff(iterator)
|
||||||
|
for i := 0; i < nwriters-1; i++ {
|
||||||
|
go ff(iterator.Split())
|
||||||
|
}
|
||||||
|
|
||||||
|
return newIter, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func WriteJSONToStdout(iterator obiiter.IBioSequence,
|
||||||
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
|
options = append(options, OptionDontCloseFile())
|
||||||
|
return WriteJSON(iterator, os.Stdout, options...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func WriteJSONToFile(iterator obiiter.IBioSequence,
|
||||||
|
filename string,
|
||||||
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
|
|
||||||
|
opt := MakeOptions(options)
|
||||||
|
flags := os.O_WRONLY | os.O_CREATE
|
||||||
|
|
||||||
|
if opt.AppendFile() {
|
||||||
|
flags |= os.O_APPEND
|
||||||
|
}
|
||||||
|
file, err := os.OpenFile(filename, flags, 0660)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("open file error: %v", err)
|
||||||
|
return obiiter.NilIBioSequence, err
|
||||||
|
}
|
||||||
|
|
||||||
|
options = append(options, OptionCloseFile())
|
||||||
|
|
||||||
|
iterator, err = WriteJSON(iterator, file, options...)
|
||||||
|
|
||||||
|
if opt.HaveToSavePaired() {
|
||||||
|
var revfile *os.File
|
||||||
|
|
||||||
|
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("open file error: %v", err)
|
||||||
|
return obiiter.NilIBioSequence, err
|
||||||
|
}
|
||||||
|
iterator, err = WriteJSON(iterator.PairedWith(), revfile, options...)
|
||||||
|
}
|
||||||
|
|
||||||
|
return iterator, err
|
||||||
|
}
|
@ -18,6 +18,7 @@ var __input_genbank_format__ = false
|
|||||||
|
|
||||||
var __output_in_fasta__ = false
|
var __output_in_fasta__ = false
|
||||||
var __output_in_fastq__ = false
|
var __output_in_fastq__ = false
|
||||||
|
var __output_in_json__ = false
|
||||||
var __output_fastjson_format__ = false
|
var __output_fastjson_format__ = false
|
||||||
var __output_fastobi_format__ = false
|
var __output_fastobi_format__ = false
|
||||||
|
|
||||||
@ -77,10 +78,13 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
|
|||||||
|
|
||||||
func OutputOptionSet(options *getoptions.GetOpt) {
|
func OutputOptionSet(options *getoptions.GetOpt) {
|
||||||
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
|
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
|
||||||
options.Description("Read data following the ecoPCR output format."))
|
options.Description("Write sequence in fasta format (default if no quality data available)."))
|
||||||
|
|
||||||
options.BoolVar(&__output_in_fastq__, "fastq-output", false,
|
options.BoolVar(&__output_in_fastq__, "fastq-output", false,
|
||||||
options.Description("Read data following the EMBL flatfile format."))
|
options.Description("Write sequence in fastq format (default if quality data available)."))
|
||||||
|
|
||||||
|
options.BoolVar(&__output_in_json__, "json-output", false,
|
||||||
|
options.Description("Write sequence in json format."))
|
||||||
|
|
||||||
options.BoolVar(&__output_fastjson_format__, "output-json-header", false,
|
options.BoolVar(&__output_fastjson_format__, "output-json-header", false,
|
||||||
options.Description("output FASTA/FASTQ title line annotations follow json format."))
|
options.Description("output FASTA/FASTQ title line annotations follow json format."))
|
||||||
@ -130,6 +134,8 @@ func CLIOutputFormat() string {
|
|||||||
return "fastq"
|
return "fastq"
|
||||||
case __output_in_fasta__:
|
case __output_in_fasta__:
|
||||||
return "fasta"
|
return "fasta"
|
||||||
|
case __output_in_json__:
|
||||||
|
return "json"
|
||||||
default:
|
default:
|
||||||
return "guessed"
|
return "guessed"
|
||||||
}
|
}
|
||||||
|
@ -87,6 +87,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
|
|||||||
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
|
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
|
||||||
case "fasta":
|
case "fasta":
|
||||||
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
|
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
|
||||||
|
case "json":
|
||||||
|
newIter, err = obiformats.WriteJSONToFile(iterator, fn, opts...)
|
||||||
default:
|
default:
|
||||||
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
|
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
|
||||||
}
|
}
|
||||||
@ -97,6 +99,8 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
|
|||||||
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
|
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
|
||||||
case "fasta":
|
case "fasta":
|
||||||
newIter, err = obiformats.WriteFastaToStdout(iterator, opts...)
|
newIter, err = obiformats.WriteFastaToStdout(iterator, opts...)
|
||||||
|
case "json":
|
||||||
|
newIter, err = obiformats.WriteJSONToStdout(iterator, opts...)
|
||||||
default:
|
default:
|
||||||
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
|
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user