Patch rev complement and first implementation of --auto in obicsv

Former-commit-id: f3020e81283b1073c4d1c2d2ff0887e3998e6764
This commit is contained in:
2023-11-07 09:37:07 +02:00
parent 6a6a6f6f2c
commit 61c30f9b6a
21 changed files with 270 additions and 107 deletions

View File

@ -10,6 +10,7 @@ import (
"time"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
@ -65,7 +66,7 @@ func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string {
l := sequence.Len()
q := sequence.Qualities()
ascii := make([]byte, l)
quality_shift := opt.QualityShift()
quality_shift := obioptions.OutputQualityShift()
for j := 0; j < l; j++ {
ascii[j] = uint8(q[j]) + uint8(quality_shift)
}
@ -130,6 +131,8 @@ func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte {
func WriteCSV(iterator obiiter.IBioSequence,
file io.WriteCloser,
options ...WithOption) (obiiter.IBioSequence, error) {
var auto_slot obiutils.Set[string]
opt := MakeOptions(options)
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
@ -167,12 +170,6 @@ func WriteCSV(iterator obiiter.IBioSequence,
newIter.Done()
}
log.Debugln("Start of the CSV file writing")
go ff(iterator)
for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split())
}
next_to_send := 0
received := make(map[int]FileChunck, 100)
@ -203,6 +200,25 @@ func WriteCSV(iterator obiiter.IBioSequence,
}()
if opt.pointer.csv_auto {
if iterator.Next() {
batch := iterator.Get()
auto_slot = batch.Slice().AttributeKeys(true)
CSVKeys(auto_slot.Members())(opt)
chunkchan <- FileChunck{
FormatCVSBatch(batch, opt),
batch.Order(),
}
newIter.Push(batch)
}
}
log.Debugln("Start of the CSV file writing")
go ff(iterator)
for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split())
}
return newIter, nil
}

View File

@ -234,7 +234,7 @@ func ParseFastaChunk(source string, ch FastxChunk) *obiiter.BioSequenceBatch {
C = C + 'a' - 'A'
}
// Removing white space from the sequence
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' {
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
ch.Bytes[current] = C
current++
}

View File

@ -329,7 +329,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
parser := func() {
defer out.Done()
for chk := range chkchan {
seqs := ParseFastqChunk(source, chk, byte(opt.QualityShift()))
seqs := ParseFastqChunk(source, chk, byte(obioptions.InputQualityShift()))
if seqs != nil {
out.Push(*seqs)
} else {

View File

@ -15,6 +15,7 @@ import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
@ -91,7 +92,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
name := C.CString(filename)
defer C.free(unsafe.Pointer(name))
pointer := C.open_fast_sek_file(name, C.int32_t(opt.QualityShift()))
pointer := C.open_fast_sek_file(name, C.int32_t(obioptions.InputQualityShift()))
var err error
err = nil
@ -150,7 +151,7 @@ func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
}(newIter)
go _FastseqReader(opt.Source(),
C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())),
C.open_fast_sek_stdin(C.int32_t(obioptions.InputQualityShift())),
newIter, opt.BatchSize())
log.Debugln("Full file batch mode : ", opt.FullFileBatch())

View File

@ -11,21 +11,16 @@ import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
// The function FormatFastq takes a BioSequence object, a quality shift value, and a header formatter
// function as input, and returns a formatted string in FASTQ format.
func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHeader) string {
func FormatFastq(seq *obiseq.BioSequence, formater FormatHeader) string {
l := seq.Len()
q := seq.Qualities()
ascii := make([]byte, seq.Len())
for j := 0; j < l; j++ {
ascii[j] = uint8(q[j]) + uint8(quality_shift)
}
q := seq.QualitiesString()
info := ""
if formater != nil {
@ -34,8 +29,8 @@ func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHead
return fmt.Sprintf("@%s %s\n%s\n+\n%s",
seq.Id(), info,
string(seq.Sequence()),
string(ascii),
seq.String(),
q,
)
}
@ -44,7 +39,7 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch, quality_shift int,
var bs bytes.Buffer
for _, seq := range batch.Slice() {
if seq.Len() > 0 {
bs.WriteString(FormatFastq(seq, quality_shift, formater))
bs.WriteString(FormatFastq(seq, formater))
bs.WriteString("\n")
} else {
if skipEmpty {
@ -81,7 +76,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
chunkchan := make(chan FileChunck)
header_format := opt.FormatFastSeqHeader()
quality := opt.QualityShift()
quality := obioptions.OutputQualityShift()
newIter.Add(nwriters)

View File

@ -12,7 +12,6 @@ type __options__ struct {
buffer_size int
batch_size int
full_file_batch bool
quality_shift int
parallel_workers int
closefile bool
appendfile bool
@ -27,6 +26,7 @@ type __options__ struct {
csv_keys []string
csv_separator string
csv_navalue string
csv_auto bool
paired_filename string
source string
}
@ -43,7 +43,6 @@ func MakeOptions(setters []WithOption) Options {
fastseq_header_writer: FormatFastSeqJsonHeader,
with_progress_bar: false,
buffer_size: 2,
quality_shift: 33,
parallel_workers: obioptions.CLIReadParallelWorkers(),
batch_size: obioptions.CLIBatchSize(),
full_file_batch: false,
@ -60,6 +59,7 @@ func MakeOptions(setters []WithOption) Options {
csv_separator: ",",
csv_navalue: "NA",
csv_keys: make([]string, 0),
csv_auto: false,
paired_filename: "",
source: "",
}
@ -73,10 +73,6 @@ func MakeOptions(setters []WithOption) Options {
return opt
}
// QualityShift returns the quality_shift value stored in the options,
// i.e. the integer offset configured for FASTQ quality encoding.
func (opt Options) QualityShift() int {
return opt.pointer.quality_shift
}
// BatchSize returns the batch_size value stored in the options.
func (opt Options) BatchSize() int {
return opt.pointer.batch_size
}
@ -153,6 +149,10 @@ func (opt Options) CSVNAValue() string {
return opt.pointer.csv_navalue
}
// CSVAutoColumn returns the csv_auto flag stored in the options.
// NOTE(review): the flag presumably asks the CSV writer to derive its
// columns from the data itself - confirm against WriteCSV.
func (opt Options) CSVAutoColumn() bool {
return opt.pointer.csv_auto
}
// HaveToSavePaired reports whether a paired file name has been configured,
// that is, whether the paired_filename option is a non-empty string.
func (opt Options) HaveToSavePaired() bool {
	pairedName := opt.pointer.paired_filename
	return len(pairedName) > 0
}
@ -217,31 +217,6 @@ func OptionsNewFile() WithOption {
return f
}
// OptionsQualityShift builds an option that records shift as the ASCII code
// corresponding to a quality of 0 in FASTQ encoded quality scores.
func OptionsQualityShift(shift int) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.quality_shift = shift
	})
}
// OptionsQualitySanger builds an option selecting a quality shift of 33,
// the value used by FastQ files whose qualities follow the Sanger
// convention. This corresponds to Illumina produced FastQ files.
func OptionsQualitySanger() WithOption {
	const sangerShift = 33
	return OptionsQualityShift(sangerShift)
}
// OptionsQualitySolexa builds an option selecting a quality shift of 64,
// the value used by FastQ files whose qualities follow the Solexa
// convention.
func OptionsQualitySolexa() WithOption {
	const solexaShift = 64
	return OptionsQualityShift(solexaShift)
}
func OptionsFastSeqHeaderParser(parser obiseq.SeqAnnotator) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.fastseq_header_parser = parser
@ -403,3 +378,11 @@ func CSVNAValue(navalue string) WithOption {
return f
}
// CSVAutoColumn builds an option that stores auto in the csv_auto flag
// of the options it is applied to.
func CSVAutoColumn(auto bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.csv_auto = auto
	})
}

View File

@ -19,6 +19,8 @@ var _ReadWorkerPerCore = 1.0
var _MaxAllowedCPU = runtime.NumCPU()
var _BatchSize = 5000
var _Pprof = false
var _Quality_Shift_Input = 33
var _Quality_Shift_Output = 33
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
@ -43,6 +45,10 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
options.GetEnv("OBIBATCHSIZE"),
options.Description("Number of sequence per batch for paralelle processing"))
options.Bool("solexa", false,
options.GetEnv("OBISOLEXA"),
options.Description("Decodes quality string according to the Solexa specification."))
for _, o := range optionset {
o(options)
}
@ -85,6 +91,15 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
}
log.Printf("Number of workers set %d", CLIParallelWorkers())
if options.Called("workers") {
}
if options.Called("solexa") {
SetInputQualityShift(64)
}
return options, remaining
}
}
@ -144,3 +159,19 @@ func ReadWorkerPerCore() float64 {
// SetBatchSize overwrites the package-level _BatchSize value with n.
// No validation is applied to n.
func SetBatchSize(n int) {
_BatchSize = n
}
// InputQualityShift returns the package-level quality shift used when
// decoding quality strings read from input (_Quality_Shift_Input).
func InputQualityShift() int {
return _Quality_Shift_Input
}
// OutputQualityShift returns the package-level quality shift used when
// encoding quality scores for output (_Quality_Shift_Output).
func OutputQualityShift() int {
return _Quality_Shift_Output
}
// SetInputQualityShift overwrites the package-level input quality shift
// (_Quality_Shift_Input) with n. No validation is applied to n.
func SetInputQualityShift(n int) {
_Quality_Shift_Input = n
}
// SetOutputQualityShift overwrites the package-level output quality shift
// (_Quality_Shift_Output) with n. No validation is applied to n.
func SetOutputQualityShift(n int) {
_Quality_Shift_Output = n
}

View File

@ -8,6 +8,48 @@ import (
log "github.com/sirupsen/logrus"
)
// AttributeKeys collects the annotation keys of the BioSequence into a set.
//
// Parameters:
//   - skip_map: when true, keys whose value is reported as a map by
//     obiutils.IsAMap are left out of the result.
//
// Returns:
//   - an obiutils.Set[string] holding the retained annotation keys.
func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] {
	result := obiutils.MakeSet[string]()

	for name, value := range s.Annotations() {
		if skip_map && obiutils.IsAMap(value) {
			continue
		}
		result.Add(name)
	}

	return result
}
// Keys returns the set of keys available on the BioSequence.
//
// The result starts from AttributeKeys(skip_map) and always adds "id".
// "sequence" is added only when HasSequence() is true and "qualities"
// only when HasQualities() is true.
//
// Parameters:
//   - skip_map: forwarded unchanged to AttributeKeys.
//
// Returns:
//   - an obiutils.Set[string] holding the keys.
func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] {
	all := s.AttributeKeys(skip_map)

	builtin := []struct {
		name    string
		present bool
	}{
		{"id", true},
		{"sequence", s.HasSequence()},
		{"qualities", s.HasQualities()},
	}

	for _, field := range builtin {
		if field.present {
			all.Add(field.name)
		}
	}

	return all
}
// HasAttribute checks if the BioSequence has the specified attribute.
//
// Parameters:
@ -16,6 +58,17 @@ import (
// Returns:
// - a boolean indicating whether the BioSequence has the attribute.
func (s *BioSequence) HasAttribute(key string) bool {
if key == "id" {
return true
}
if key == "sequence" && s.sequence != nil {
return true
}
if key == "qualities" && s.qualities != nil {
return true
}
ok := s.annotations != nil
if ok {
@ -36,6 +89,25 @@ func (s *BioSequence) HasAttribute(key string) bool {
// - val: The value associated with the given key.
// - ok: A boolean indicating whether the key exists in the annotations map.
func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
if key == "id" {
return s.id, true
}
if key == "sequence" {
if s.HasSequence() {
return s.String(), true
}
return nil, false
}
if key == "qualities" {
if s.HasQualities() {
return s.QualitiesString(), true
}
return nil, false
}
var val interface{}
ok := s.annotations != nil
@ -54,6 +126,17 @@ func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
// - key: the key to set the value for.
// - value: the value to set for the given key.
func (s *BioSequence) SetAttribute(key string, value interface{}) {
if key == "id" {
s.SetId(value.(string))
return
}
if key == "sequence" {
s.SetSequence(value.([]byte))
return
}
annot := s.Annotations()
defer s.AnnotationsUnlock()

View File

@ -15,6 +15,7 @@ import (
"sync"
"sync/atomic"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
@ -55,8 +56,7 @@ type Annotation map[string]interface{}
// A BioSequence is a sequence of bytes with an identifier, a definition, a sequence, qualities,
// features and annotations. It aims to represent a biological sequence
type BioSequence struct {
id string // The identidier of the sequence (private accessible through the method Id)
//definition string // The documentation of the sequence (private accessible through the method Definition)
id string // The identidier of the sequence (private accessible through the method Id)
source string // The filename without directory name and extension from where the sequence was read.
sequence []byte // The sequence itself, it is accessible by the methode Sequence
qualities []byte // The quality scores of the sequence.
@ -188,6 +188,14 @@ func (s *BioSequence) Definition() string {
return definition
}
// HasSequence reports whether the BioSequence holds at least one byte of
// sequence data.
//
// The length of a nil slice is zero, so a single length test covers both
// the "never set" and the "set but empty" cases; no separate nil check is
// required.
func (s *BioSequence) HasSequence() bool {
	return len(s.sequence) > 0
}
// Sequence returns the sequence of the BioSequence.
//
// Returns:
@ -217,7 +225,7 @@ func (s *BioSequence) Len() int {
// This function does not have any parameters.
// It returns a boolean value indicating whether the BioSequence has qualities.
func (s *BioSequence) HasQualities() bool {
return len(s.qualities) > 0
return s.qualities != nil && len(s.qualities) > 0
}
// Qualities returns the sequence quality scores of the BioSequence.
@ -235,6 +243,19 @@ func (s *BioSequence) Qualities() Quality {
return __make_default_qualities__(len(s.sequence))
}
// QualitiesString renders the quality scores of the BioSequence as text.
//
// Every score returned by Qualities() is offset by the value of
// obioptions.OutputQualityShift() (truncated to a byte) and the resulting
// bytes are returned as a string.
func (s *BioSequence) QualitiesString() string {
	shift := byte(obioptions.OutputQualityShift())
	scores := s.Qualities()

	encoded := make([]byte, len(scores))
	for i, score := range scores {
		encoded[i] = byte(score + shift)
	}

	return string(encoded)
}
// Features returns the feature string of the BioSequence.
//
// The feature string contains the EMBL/GenBank not parsed feature table

View File

@ -3,6 +3,7 @@ package obiseq
import (
"sync"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/slices"
)
@ -171,3 +172,13 @@ func (s BioSequenceSlice) Size() int {
return size
}
// AttributeKeys returns the union of the attribute keys of every sequence
// in the slice.
//
// Parameters:
//   - skip_map: forwarded unchanged to the AttributeKeys method of each
//     sequence.
//
// Returns:
//   - an obiutils.Set[string]; it is empty when the slice has no element.
func (s BioSequenceSlice) AttributeKeys(skip_map bool) obiutils.Set[string] {
	merged := obiutils.MakeSet[string]()

	for i := range s {
		seqKeys := s[i].AttributeKeys(skip_map)
		merged = merged.Union(seqKeys)
	}

	return merged
}

View File

@ -198,6 +198,15 @@ var OBILang = gval.NewLanguage(
composition := (args[0].(*BioSequence)).Composition()
return float64(composition['g']-composition['c']) / float64(composition['g']+composition['c']), nil
}),
gval.Function("gc", func(args ...interface{}) (interface{}, error) {
composition := (args[0].(*BioSequence)).Composition()
return float64(composition['g']+composition['c']) / float64(args[0].(*BioSequence).Len()), nil
}),
gval.Function("composition", func(args ...interface{}) (interface{}, error) {
return (args[0].(*BioSequence)).Composition(), nil
comp := (args[0].(*BioSequence)).Composition()
scomp := make(map[string]float64)
for k, v := range comp {
scomp[string(k)] = float64(v)
}
return scomp, nil
}))

View File

@ -1,7 +1,17 @@
package obiseq
// ".ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
var _revcmpDNA = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][")
var _revcmpDNA = []byte(".TVGHNNCDNNMNKNNNNYSAABWNRN]N[NNN")
// complement returns the complementary symbol of the nucleotide code n.
//
//   - '.' and '-' are returned unchanged.
//   - Any other byte outside the ASCII range 'A'..'z' yields 'n'.
//   - Bytes inside that range are looked up in _revcmpDNA using the five
//     low bits of n as index (so 'a' and 'A' both map to index 1), and the
//     0x20 bit of n is OR-ed back so the result keeps the case of the input.
func complement(n byte) byte {
	if n == '.' || n == '-' {
		return n
	}

	if n < 'A' || n > 'z' {
		return 'n'
	}

	symbol := _revcmpDNA[n&31]
	caseBit := n & 0x20

	return symbol | caseBit
}
// Reverse complements a DNA sequence.
// If the inplace parameter is true, that operation is done in place.
@ -18,8 +28,7 @@ func (sequence *BioSequence) ReverseComplement(inplace bool) *BioSequence {
// ASCII code & 31 -> builds an index in which (a|A) is 1
// ASCII code & 0x20 -> keeps the case bit, so the complement has the same case as the input
s[j], s[i] = _revcmpDNA[s[i]&31]|(s[i]&0x20),
_revcmpDNA[s[j]&31]|(s[j]&0x20)
s[j], s[i] = complement(s[i]), complement(s[j])
j++
}
@ -40,8 +49,7 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence {
b := []byte(m)
// Exchange and reverse complement the symbols
b[1], b[9] = _revcmpDNA[b[9]&31]|(b[9]&0x20),
_revcmpDNA[b[1]&31]|(b[1]&0x20)
b[1], b[9] = complement(b[9]), complement(b[1])
// Exchange sequencing scores
b[3], b[4], b[11], b[12] = b[11], b[12], b[3], b[4]
@ -65,7 +73,6 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence {
return sequence
}
func ReverseComplementWorker(inplace bool) SeqWorker {
f := func(input *BioSequence) *BioSequence {
return input.ReverseComplement(inplace)

View File

@ -16,13 +16,10 @@ var __input_ecopcr_format__ = false
var __input_embl_format__ = false
var __input_genbank_format__ = false
var __input_solexa_quality__ = false
var __output_in_fasta__ = false
var __output_in_fastq__ = false
var __output_fastjson_format__ = false
var __output_fastobi_format__ = false
var __output_solexa_quality__ = false
var __no_progress_bar__ = false
var __compressed__ = false
@ -54,9 +51,6 @@ func InputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__input_genbank_format__, "genbank", __input_genbank_format__,
options.Description("Read data following the Genbank flatfile format."))
options.BoolVar(&__input_solexa_quality__, "solexa", __input_solexa_quality__,
options.Description("Decodes quality string according to the Solexa specification."))
options.BoolVar(&__no_ordered_input__, "no-order", __no_ordered_input__,
options.Description("When several input files are provided, "+
"indicates that there is no order among them."))
@ -71,7 +65,7 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
options.Alias("Z"),
options.Description("Output is compressed"))
options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__,
options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__,
options.Description("Sequences of length equal to zero are suppressed from the output"))
options.StringVar(&__output_file_name__, "out", __output_file_name__,
@ -146,7 +140,7 @@ func CLICompressed() bool {
}
func CLISkipEmpty() bool {
return __skip_empty__
return __skip_empty__
}
func CLIInputFastHeaderFormat() string {
@ -181,22 +175,6 @@ func CLIAnalyzeOnly() int {
return __read_only_entries__
}
// CLIInputQualityShift returns the quality shift to use when reading
// input: 64 when the __input_solexa_quality__ flag is set, 33 otherwise.
func CLIInputQualityShift() int {
	if !__input_solexa_quality__ {
		return 33
	}
	return 64
}
// CLIOutputQualityShift returns the quality shift to use when writing
// output: 64 when the __output_solexa_quality__ flag is set, 33 otherwise.
func CLIOutputQualityShift() int {
	if !__output_solexa_quality__ {
		return 33
	}
	return 64
}
// CLIProgressBar reports whether the progress bar is enabled, i.e. it
// returns the negation of the __no_progress_bar__ flag.
func CLIProgressBar() bool {
return !__no_progress_bar__
}
@ -217,4 +195,4 @@ func SetFullFileBatch() {
}
func FullFileBatch() bool {
return __full_file_batch__
}
}

View File

@ -98,10 +98,8 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))
opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch()))
if len(filenames) == 0 {
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
opts = append(opts, obiformats.OptionsSource("stdin"))

View File

@ -59,8 +59,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(CLIOutputQualityShift()))
opts = append(opts, obiformats.OptionsCompressed(CLICompressed()))
var err error

View File

@ -27,8 +27,6 @@ func CLIWriteCSV(iterator obiiter.IBioSequence,
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()))
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
opts = append(opts, obiformats.CSVId(CLIPrintId()),
@ -37,6 +35,7 @@ func CLIWriteCSV(iterator obiiter.IBioSequence,
obiformats.CSVDefinition(CLIPrintDefinition()),
obiformats.CSVKeys(CLIToBeKeptAttributes()),
obiformats.CSVSequence(CLIPrintSequence()),
obiformats.CSVAutoColumn(CLIAutoColumns()),
)
var err error

View File

@ -32,7 +32,6 @@ func DistributeSequence(sequences obiiter.IBioSequence) {
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers),
obiformats.OptionsBatchSize(obioptions.CLIBatchSize()),
obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()),
obiformats.OptionsAppendFile(CLIAppendSequences()),
obiformats.OptionsCompressed(obiconvert.CLICompressed()))

View File

@ -93,8 +93,6 @@ func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()))
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
var err error