mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 08:40:26 +00:00
Patch reverse complement and add first implementation of --auto in obicsv
Former-commit-id: f3020e81283b1073c4d1c2d2ff0887e3998e6764
This commit is contained in:
@@ -10,6 +10,7 @@ import (
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
@@ -65,7 +66,7 @@ func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string {
|
||||
l := sequence.Len()
|
||||
q := sequence.Qualities()
|
||||
ascii := make([]byte, l)
|
||||
quality_shift := opt.QualityShift()
|
||||
quality_shift := obioptions.OutputQualityShift()
|
||||
for j := 0; j < l; j++ {
|
||||
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||
}
|
||||
@@ -130,6 +131,8 @@ func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte {
|
||||
func WriteCSV(iterator obiiter.IBioSequence,
|
||||
file io.WriteCloser,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
var auto_slot obiutils.Set[string]
|
||||
opt := MakeOptions(options)
|
||||
|
||||
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
@@ -167,12 +170,6 @@ func WriteCSV(iterator obiiter.IBioSequence,
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
log.Debugln("Start of the CSV file writing")
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]FileChunck, 100)
|
||||
|
||||
@@ -203,6 +200,25 @@ func WriteCSV(iterator obiiter.IBioSequence,
|
||||
|
||||
}()
|
||||
|
||||
if opt.pointer.csv_auto {
|
||||
if iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
auto_slot = batch.Slice().AttributeKeys(true)
|
||||
CSVKeys(auto_slot.Members())(opt)
|
||||
chunkchan <- FileChunck{
|
||||
FormatCVSBatch(batch, opt),
|
||||
batch.Order(),
|
||||
}
|
||||
newIter.Push(batch)
|
||||
}
|
||||
}
|
||||
|
||||
log.Debugln("Start of the CSV file writing")
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -234,7 +234,7 @@ func ParseFastaChunk(source string, ch FastxChunk) *obiiter.BioSequenceBatch {
|
||||
C = C + 'a' - 'A'
|
||||
}
|
||||
// Removing white space from the sequence
|
||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
|
||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||
ch.Bytes[current] = C
|
||||
current++
|
||||
}
|
||||
|
||||
@@ -329,7 +329,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
parser := func() {
|
||||
defer out.Done()
|
||||
for chk := range chkchan {
|
||||
seqs := ParseFastqChunk(source, chk, byte(opt.QualityShift()))
|
||||
seqs := ParseFastqChunk(source, chk, byte(obioptions.InputQualityShift()))
|
||||
if seqs != nil {
|
||||
out.Push(*seqs)
|
||||
} else {
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
)
|
||||
@@ -91,7 +92,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
name := C.CString(filename)
|
||||
defer C.free(unsafe.Pointer(name))
|
||||
|
||||
pointer := C.open_fast_sek_file(name, C.int32_t(opt.QualityShift()))
|
||||
pointer := C.open_fast_sek_file(name, C.int32_t(obioptions.InputQualityShift()))
|
||||
|
||||
var err error
|
||||
err = nil
|
||||
@@ -150,7 +151,7 @@ func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
|
||||
}(newIter)
|
||||
|
||||
go _FastseqReader(opt.Source(),
|
||||
C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())),
|
||||
C.open_fast_sek_stdin(C.int32_t(obioptions.InputQualityShift())),
|
||||
newIter, opt.BatchSize())
|
||||
|
||||
log.Debugln("Full file batch mode : ", opt.FullFileBatch())
|
||||
|
||||
@@ -11,21 +11,16 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
)
|
||||
|
||||
// FormatFastq takes a BioSequence object and a header formatter
|
||||
// function as input, and returns a formatted string in FASTQ format.
|
||||
func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHeader) string {
|
||||
func FormatFastq(seq *obiseq.BioSequence, formater FormatHeader) string {
|
||||
|
||||
l := seq.Len()
|
||||
q := seq.Qualities()
|
||||
ascii := make([]byte, seq.Len())
|
||||
|
||||
for j := 0; j < l; j++ {
|
||||
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||
}
|
||||
q := seq.QualitiesString()
|
||||
|
||||
info := ""
|
||||
if formater != nil {
|
||||
@@ -34,8 +29,8 @@ func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHead
|
||||
|
||||
return fmt.Sprintf("@%s %s\n%s\n+\n%s",
|
||||
seq.Id(), info,
|
||||
string(seq.Sequence()),
|
||||
string(ascii),
|
||||
seq.String(),
|
||||
q,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -44,7 +39,7 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch, quality_shift int,
|
||||
var bs bytes.Buffer
|
||||
for _, seq := range batch.Slice() {
|
||||
if seq.Len() > 0 {
|
||||
bs.WriteString(FormatFastq(seq, quality_shift, formater))
|
||||
bs.WriteString(FormatFastq(seq, formater))
|
||||
bs.WriteString("\n")
|
||||
} else {
|
||||
if skipEmpty {
|
||||
@@ -81,7 +76,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
||||
chunkchan := make(chan FileChunck)
|
||||
|
||||
header_format := opt.FormatFastSeqHeader()
|
||||
quality := opt.QualityShift()
|
||||
quality := obioptions.OutputQualityShift()
|
||||
|
||||
newIter.Add(nwriters)
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ type __options__ struct {
|
||||
buffer_size int
|
||||
batch_size int
|
||||
full_file_batch bool
|
||||
quality_shift int
|
||||
parallel_workers int
|
||||
closefile bool
|
||||
appendfile bool
|
||||
@@ -27,6 +26,7 @@ type __options__ struct {
|
||||
csv_keys []string
|
||||
csv_separator string
|
||||
csv_navalue string
|
||||
csv_auto bool
|
||||
paired_filename string
|
||||
source string
|
||||
}
|
||||
@@ -43,7 +43,6 @@ func MakeOptions(setters []WithOption) Options {
|
||||
fastseq_header_writer: FormatFastSeqJsonHeader,
|
||||
with_progress_bar: false,
|
||||
buffer_size: 2,
|
||||
quality_shift: 33,
|
||||
parallel_workers: obioptions.CLIReadParallelWorkers(),
|
||||
batch_size: obioptions.CLIBatchSize(),
|
||||
full_file_batch: false,
|
||||
@@ -60,6 +59,7 @@ func MakeOptions(setters []WithOption) Options {
|
||||
csv_separator: ",",
|
||||
csv_navalue: "NA",
|
||||
csv_keys: make([]string, 0),
|
||||
csv_auto: false,
|
||||
paired_filename: "",
|
||||
source: "",
|
||||
}
|
||||
@@ -73,10 +73,6 @@ func MakeOptions(setters []WithOption) Options {
|
||||
return opt
|
||||
}
|
||||
|
||||
// QualityShift returns the quality_shift value stored in the options.
func (opt Options) QualityShift() int {
|
||||
return opt.pointer.quality_shift
|
||||
}
|
||||
|
||||
// BatchSize returns the batch_size value stored in the options.
func (opt Options) BatchSize() int {
|
||||
return opt.pointer.batch_size
|
||||
}
|
||||
@@ -153,6 +149,10 @@ func (opt Options) CSVNAValue() string {
|
||||
return opt.pointer.csv_navalue
|
||||
}
|
||||
|
||||
// CSVAutoColumn reports whether the csv_auto flag is set in the options.
func (opt Options) CSVAutoColumn() bool {
|
||||
return opt.pointer.csv_auto
|
||||
}
|
||||
|
||||
// HaveToSavePaired reports whether a paired filename has been configured,
// i.e. whether paired_filename is non-empty.
func (opt Options) HaveToSavePaired() bool {
|
||||
return opt.pointer.paired_filename != ""
|
||||
}
|
||||
@@ -217,31 +217,6 @@ func OptionsNewFile() WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionsQualityShift sets the ASCII code corresponding to
|
||||
// a quality of 0 in FASTQ-encoded quality scores.
|
||||
func OptionsQualityShift(shift int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.quality_shift = shift
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionsQualitySanger selects a quality shift of 33, corresponding
|
||||
// to FASTQ qualities encoded following the Sanger
|
||||
// convention. This corresponds to Illumina-produced FASTQ
|
||||
// files.
|
||||
func OptionsQualitySanger() WithOption {
|
||||
return OptionsQualityShift(33)
|
||||
}
|
||||
|
||||
// OptionsQualitySolexa selects a quality shift of 64, corresponding
|
||||
// to FASTQ qualities encoded following the Solexa
|
||||
// convention.
|
||||
func OptionsQualitySolexa() WithOption {
|
||||
return OptionsQualityShift(64)
|
||||
}
|
||||
|
||||
func OptionsFastSeqHeaderParser(parser obiseq.SeqAnnotator) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.fastseq_header_parser = parser
|
||||
@@ -403,3 +378,11 @@ func CSVNAValue(navalue string) WithOption {
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// CSVAutoColumn returns an option that sets the csv_auto flag to auto.
func CSVAutoColumn(auto bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_auto = auto
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user