From 61c30f9b6a33a8705e0318dbcb5cf82f6a95b6e4 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 7 Nov 2023 09:37:07 +0200 Subject: [PATCH] Patch rev complement and first implementation of --auto in obicsv Former-commit-id: f3020e81283b1073c4d1c2d2ff0887e3998e6764 --- Release-notes.md | 15 ++++ go.mod | 19 ++--- go.sum | 20 ++++++ pkg/obiformats/csv_writer.go | 30 ++++++-- pkg/obiformats/fastaseq_read.go | 2 +- pkg/obiformats/fastqseq_read.go | 2 +- pkg/obiformats/fastseq_read.go | 5 +- pkg/obiformats/fastseq_write_fastq.go | 19 ++--- pkg/obiformats/options.go | 45 ++++-------- pkg/obioptions/options.go | 31 ++++++++ pkg/obiseq/attributes.go | 83 ++++++++++++++++++++++ pkg/obiseq/biosequence.go | 27 ++++++- pkg/obiseq/biosequenceslice.go | 11 +++ pkg/obiseq/language.go | 11 ++- pkg/obiseq/revcomp.go | 19 +++-- pkg/obitools/obiconvert/options.go | 28 +------- pkg/obitools/obiconvert/sequence_reader.go | 2 - pkg/obitools/obiconvert/sequence_writer.go | 2 - pkg/obitools/obicsv/obicsv.go | 3 +- pkg/obitools/obidistribute/distribute.go | 1 - pkg/obitools/obitag/options.go | 2 - 21 files changed, 270 insertions(+), 107 deletions(-) diff --git a/Release-notes.md b/Release-notes.md index a8312ac..6d87887 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -2,6 +2,10 @@ ## Latest changes +### New feature + +- In the obitools language a new `gc` computes the gc fraction of a sequence. + ### Enhancement - A new completely rewritten GO version of the fastq and fasta parser is now used instead of the original C version. @@ -13,6 +17,17 @@ + If -D is set to 0, the output sequence is the barcode with the priming sites. + When -D is set to ### (where ### is an integer), the output sequence is the barcode with the priming sites. and ### base pairs of flanking sequences. + +### Bugs + +- in the obitools language, the `composition` function now returns a map indexded by lowercase string "a", "c", "g", "t" and "o" for other instead of being indexed by the ascii codes of the corresponding letters. +- Correction of the reverse-complement operation. Every reverse complement of the DNA sequence follow now the following rules : + + Nucleotides code are complemented to their lower complementary base + + `.` and `-` characters are returned without change + + `[` is complemented to `]` and oppositely + + all other characters are complemented as `n` + + ### Becareful GO 1.21.0 is out, and it includes new functionalities which are used in the OBITools4 code. diff --git a/go.mod b/go.mod index fd77a09..4e1a4ed 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module git.metabarcoding.org/lecasofts/go/obitools go 1.20 require ( - github.com/DavidGamba/go-getoptions v0.27.0 + github.com/DavidGamba/go-getoptions v0.28.0 github.com/PaesslerAG/gval v1.2.2 github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df github.com/chen3feng/stl4go v0.1.1 @@ -15,12 +15,13 @@ require ( github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.7.0 github.com/tevino/abool/v2 v2.1.0 - golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 + golang.org/x/exp v0.0.0-20231006140011-7918f672742d gonum.org/v1/gonum v0.14.0 scientificgo.org/special v0.0.0 ) require ( + github.com/deckarep/golang-set/v2 v2.3.1 // indirect github.com/kr/pretty v0.2.1 // indirect github.com/kr/text v0.1.0 // indirect ) @@ -28,8 +29,8 @@ require ( require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/dsnet/compress v0.0.1 - github.com/gabriel-vasile/mimetype v1.4.2 // indirect - github.com/klauspost/compress v1.16.7 // indirect + github.com/gabriel-vasile/mimetype v1.4.3 + github.com/klauspost/compress v1.17.2 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/pmezard/go-difflib v1.0.0 // indirect @@ -37,11 +38,11 @@ require ( github.com/shopspring/decimal v1.3.1 // indirect github.com/ulikunitz/xz v0.5.11 github.com/yuin/goldmark v1.4.13 // indirect - golang.org/x/mod v0.12.0 // indirect - golang.org/x/net v0.14.0 // indirect - golang.org/x/sys v0.11.0 // indirect - golang.org/x/term v0.11.0 // indirect - golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 // indirect + golang.org/x/mod v0.13.0 // indirect + golang.org/x/net v0.17.0 // indirect + golang.org/x/sys v0.13.0 // indirect + golang.org/x/term v0.13.0 // indirect + golang.org/x/tools v0.14.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect ) diff --git a/go.sum b/go.sum index 41309d0..c7d4b51 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,7 @@ github.com/DavidGamba/go-getoptions v0.27.0 h1:hldKJSwO9SwvR+z9pe6ojhEcYECrRiO/bar9B7MnBKA= github.com/DavidGamba/go-getoptions v0.27.0/go.mod h1:qLaLSYeQ8sUVOfKuu5JT5qKKS3OCwyhkYSJnoG+ggmo= +github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c= +github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84= github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E= github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac= github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= @@ -13,17 +15,23 @@ github.com/daichi-m/go18ds v1.12.1/go.mod h1:wc2dURUr8aMxxC4Mn5ObJGVM7uIKU8JagY4 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set/v2 v2.3.1 h1:vjmkvJt/IV27WXPyYQpAh4bRyWJc5Y435D17XQ9QU5A= +github.com/deckarep/golang-set/v2 v2.3.1/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q= github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= +github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= +github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4= +github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= @@ -64,20 +72,32 @@ github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= +golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= +golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= +golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.11.0 h1:F9tnn/DA/Im8nCwm+fX+1/eBwi4qFjRT++MhtVC4ZX0= golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= +golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 h1:Vve/L0v7CXXuxUmaMGIEK/dEeq7uiqb5qBgQrZzIE7E= golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM= +golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc= +golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg= gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0= gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= diff --git a/pkg/obiformats/csv_writer.go b/pkg/obiformats/csv_writer.go index 28fe2f6..d055043 100644 --- a/pkg/obiformats/csv_writer.go +++ b/pkg/obiformats/csv_writer.go @@ -10,6 +10,7 @@ import ( "time" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" log "github.com/sirupsen/logrus" @@ -65,7 +66,7 @@ func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string { l := sequence.Len() q := sequence.Qualities() ascii := make([]byte, l) - quality_shift := opt.QualityShift() + quality_shift := obioptions.OutputQualityShift() for j := 0; j < l; j++ { ascii[j] = uint8(q[j]) + uint8(quality_shift) } @@ -130,6 +131,8 @@ func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte { func WriteCSV(iterator obiiter.IBioSequence, file io.WriteCloser, options ...WithOption) (obiiter.IBioSequence, error) { + + var auto_slot obiutils.Set[string] opt := MakeOptions(options) file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) @@ -167,12 +170,6 @@ func WriteCSV(iterator obiiter.IBioSequence, newIter.Done() } - log.Debugln("Start of the CSV file writing") - go ff(iterator) - for i := 0; i < nwriters-1; i++ { - go ff(iterator.Split()) - } - next_to_send := 0 received := make(map[int]FileChunck, 100) @@ -203,6 +200,25 @@ func WriteCSV(iterator obiiter.IBioSequence, }() + if opt.pointer.csv_auto { + if iterator.Next() { + batch := iterator.Get() + auto_slot = batch.Slice().AttributeKeys(true) + CSVKeys(auto_slot.Members())(opt) + chunkchan <- FileChunck{ + FormatCVSBatch(batch, opt), + batch.Order(), + } + newIter.Push(batch) + } + } + + log.Debugln("Start of the CSV file writing") + go ff(iterator) + for i := 0; i < nwriters-1; i++ { + go ff(iterator.Split()) + } + return newIter, nil } diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 9774970..38a5222 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -234,7 +234,7 @@ func ParseFastaChunk(source string, ch FastxChunk) *obiiter.BioSequenceBatch { C = C + 'a' - 'A' } // Removing white space from the sequence - if (C >= 'a' && C <= 'z') || C == '-' || C == '.' { + if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' { ch.Bytes[current] = C current++ } diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index 4b81d05..b6bbc5a 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -329,7 +329,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e parser := func() { defer out.Done() for chk := range chkchan { - seqs := ParseFastqChunk(source, chk, byte(opt.QualityShift())) + seqs := ParseFastqChunk(source, chk, byte(obioptions.InputQualityShift())) if seqs != nil { out.Push(*seqs) } else { diff --git a/pkg/obiformats/fastseq_read.go b/pkg/obiformats/fastseq_read.go index f1e4364..1493181 100644 --- a/pkg/obiformats/fastseq_read.go +++ b/pkg/obiformats/fastseq_read.go @@ -15,6 +15,7 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" ) @@ -91,7 +92,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe name := C.CString(filename) defer C.free(unsafe.Pointer(name)) - pointer := C.open_fast_sek_file(name, C.int32_t(opt.QualityShift())) + pointer := C.open_fast_sek_file(name, C.int32_t(obioptions.InputQualityShift())) var err error err = nil @@ -150,7 +151,7 @@ func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence { }(newIter) go _FastseqReader(opt.Source(), - C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())), + C.open_fast_sek_stdin(C.int32_t(obioptions.InputQualityShift())), newIter, opt.BatchSize()) log.Debugln("Full file batch mode : ", opt.FullFileBatch()) diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go index 7d4ddd4..7eabea1 100644 --- a/pkg/obiformats/fastseq_write_fastq.go +++ b/pkg/obiformats/fastseq_write_fastq.go @@ -11,21 +11,16 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" ) // The function FormatFastq takes a BioSequence object, a quality shift value, and a header formatter // function as input, and returns a formatted string in FASTQ format. -func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHeader) string { +func FormatFastq(seq *obiseq.BioSequence, formater FormatHeader) string { - l := seq.Len() - q := seq.Qualities() - ascii := make([]byte, seq.Len()) - - for j := 0; j < l; j++ { - ascii[j] = uint8(q[j]) + uint8(quality_shift) - } + q := seq.QualitiesString() info := "" if formater != nil { @@ -34,8 +29,8 @@ func FormatFastq(seq *obiseq.BioSequence, quality_shift int, formater FormatHead return fmt.Sprintf("@%s %s\n%s\n+\n%s", seq.Id(), info, - string(seq.Sequence()), - string(ascii), + seq.String(), + q, ) } @@ -44,7 +39,7 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch, quality_shift int, var bs bytes.Buffer for _, seq := range batch.Slice() { if seq.Len() > 0 { - bs.WriteString(FormatFastq(seq, quality_shift, formater)) + bs.WriteString(FormatFastq(seq, formater)) bs.WriteString("\n") } else { if skipEmpty { @@ -81,7 +76,7 @@ func WriteFastq(iterator obiiter.IBioSequence, chunkchan := make(chan FileChunck) header_format := opt.FormatFastSeqHeader() - quality := opt.QualityShift() + quality := obioptions.OutputQualityShift() newIter.Add(nwriters) diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go index 8c08687..b180bb3 100644 --- a/pkg/obiformats/options.go +++ b/pkg/obiformats/options.go @@ -12,7 +12,6 @@ type __options__ struct { buffer_size int batch_size int full_file_batch bool - quality_shift int parallel_workers int closefile bool appendfile bool @@ -27,6 +26,7 @@ type __options__ struct { csv_keys []string csv_separator string csv_navalue string + csv_auto bool paired_filename string source string } @@ -43,7 +43,6 @@ func MakeOptions(setters []WithOption) Options { fastseq_header_writer: FormatFastSeqJsonHeader, with_progress_bar: false, buffer_size: 2, - quality_shift: 33, parallel_workers: obioptions.CLIReadParallelWorkers(), batch_size: obioptions.CLIBatchSize(), full_file_batch: false, @@ -60,6 +59,7 @@ func MakeOptions(setters []WithOption) Options { csv_separator: ",", csv_navalue: "NA", csv_keys: make([]string, 0), + csv_auto: false, paired_filename: "", source: "", } @@ -73,10 +73,6 @@ func MakeOptions(setters []WithOption) Options { return opt } -func (opt Options) QualityShift() int { - return opt.pointer.quality_shift -} - func (opt Options) BatchSize() int { return opt.pointer.batch_size } @@ -153,6 +149,10 @@ func (opt Options) CSVNAValue() string { return opt.pointer.csv_navalue } +func (opt Options) CSVAutoColumn() bool { + return opt.pointer.csv_auto +} + func (opt Options) HaveToSavePaired() bool { return opt.pointer.paired_filename != "" } @@ -217,31 +217,6 @@ func OptionsNewFile() WithOption { return f } -// Allows to specify the ascii code corresponding to -// a quality of 0 in fastq encoded quality scores. -func OptionsQualityShift(shift int) WithOption { - f := WithOption(func(opt Options) { - opt.pointer.quality_shift = shift - }) - - return f -} - -// Allows to specify a quality shift of 33, corresponding -// to a FastQ file qualities encoded following Sanger -// convention. This corresponds to Illumina produced FastQ -// files. -func OptionsQualitySanger() WithOption { - return OptionsQualityShift(33) -} - -// Allows to specify a quality shift of 64, corresponding -// to a FastQ file qualities encoded following the Solexa -// convention. -func OptionsQualitySolexa() WithOption { - return OptionsQualityShift(64) -} - func OptionsFastSeqHeaderParser(parser obiseq.SeqAnnotator) WithOption { f := WithOption(func(opt Options) { opt.pointer.fastseq_header_parser = parser @@ -403,3 +378,11 @@ func CSVNAValue(navalue string) WithOption { return f } + +func CSVAutoColumn(auto bool) WithOption { + f := WithOption(func(opt Options) { + opt.pointer.csv_auto = auto + }) + + return f +} diff --git a/pkg/obioptions/options.go b/pkg/obioptions/options.go index f368bb2..9115628 100644 --- a/pkg/obioptions/options.go +++ b/pkg/obioptions/options.go @@ -19,6 +19,8 @@ var _ReadWorkerPerCore = 1.0 var _MaxAllowedCPU = runtime.NumCPU() var _BatchSize = 5000 var _Pprof = false +var _Quality_Shift_Input = 33 +var _Quality_Shift_Output = 33 type ArgumentParser func([]string) (*getoptions.GetOpt, []string) @@ -43,6 +45,10 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser options.GetEnv("OBIBATCHSIZE"), options.Description("Number of sequence per batch for paralelle processing")) + options.Bool("solexa", false, + options.GetEnv("OBISOLEXA"), + options.Description("Decodes quality string according to the Solexa specification.")) + for _, o := range optionset { o(options) } @@ -85,6 +91,15 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser } log.Printf("Number of workers set %d", CLIParallelWorkers()) + + if options.Called("workers") { + + } + + if options.Called("solexa") { + SetInputQualityShift(64) + } + return options, remaining } } @@ -144,3 +159,19 @@ func ReadWorkerPerCore() float64 { func SetBatchSize(n int) { _BatchSize = n } + +func InputQualityShift() int { + return _Quality_Shift_Input +} + +func OutputQualityShift() int { + return _Quality_Shift_Output +} + +func SetInputQualityShift(n int) { + _Quality_Shift_Input = n +} + +func SetOutputQualityShift(n int) { + _Quality_Shift_Output = n +} diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index e5696c4..d1cbda9 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -8,6 +8,48 @@ import ( log "github.com/sirupsen/logrus" ) +// AttributeKeys returns the keys of the attributes in the BioSequence. +// +// It does not take any parameters. +// +// Returns: +// +// []string: The keys of the BioSequence. +func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] { + keys := obiutils.MakeSet[string]() + + for k, v := range s.Annotations() { + if !skip_map || !obiutils.IsAMap(v) { + keys.Add(k) + } + } + + return keys +} + +// Keys returns the keys of the BioSequence. +// +// It returns a slice of strings containing the keys of the BioSequence. +// The keys include "id", "sequence", "qualities", and the attribute keys +// of the BioSequence. +// +// Returns: +// +// []string: The keys of the BioSequence. +func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] { + keys := s.AttributeKeys(skip_map) + keys.Add("id") + + if s.HasSequence() { + keys.Add("sequence") + } + if s.HasQualities() { + keys.Add("qualities") + } + + return keys +} + // HasAttribute checks if the BioSequence has the specified attribute. // // Parameters: @@ -16,6 +58,17 @@ import ( // Returns: // - a boolean indicating whether the BioSequence has the attribute. func (s *BioSequence) HasAttribute(key string) bool { + if key == "id" { + return true + } + + if key == "sequence" && s.sequence != nil { + return true + } + + if key == "qualities" && s.qualities != nil { + return true + } ok := s.annotations != nil if ok { @@ -36,6 +89,25 @@ func (s *BioSequence) HasAttribute(key string) bool { // - val: The value associated with the given key. // - ok: A boolean indicating whether the key exists in the annotations map. func (s *BioSequence) GetAttribute(key string) (interface{}, bool) { + + if key == "id" { + return s.id, true + } + + if key == "sequence" { + if s.HasSequence() { + return s.String(), true + } + return nil, false + } + + if key == "qualities" { + if s.HasQualities() { + return s.QualitiesString(), true + } + return nil, false + } + var val interface{} ok := s.annotations != nil @@ -54,6 +126,17 @@ func (s *BioSequence) GetAttribute(key string) (interface{}, bool) { // - key: the key to set the value for. // - value: the value to set for the given key. func (s *BioSequence) SetAttribute(key string, value interface{}) { + + if key == "id" { + s.SetId(value.(string)) + return + } + + if key == "sequence" { + s.SetSequence(value.([]byte)) + return + } + annot := s.Annotations() defer s.AnnotationsUnlock() diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 5f82658..c89d8d9 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -15,6 +15,7 @@ import ( "sync" "sync/atomic" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" log "github.com/sirupsen/logrus" ) @@ -55,8 +56,7 @@ type Annotation map[string]interface{} // A BioSequence is a sequence of bytes with an identifier, a definition, a sequence, qualities, // features and annotations. It aims to represent a biological sequence type BioSequence struct { - id string // The identidier of the sequence (private accessible through the method Id) - //definition string // The documentation of the sequence (private accessible through the method Definition) + id string // The identidier of the sequence (private accessible through the method Id) source string // The filename without directory name and extension from where the sequence was read. sequence []byte // The sequence itself, it is accessible by the methode Sequence qualities []byte // The quality scores of the sequence. @@ -188,6 +188,14 @@ func (s *BioSequence) Definition() string { return definition } +// HasSequence checks if the BioSequence has a sequence. +// +// No parameters. +// Returns a boolean. +func (s *BioSequence) HasSequence() bool { + return s.sequence != nil && len(s.sequence) > 0 +} + // Sequence returns the sequence of the BioSequence. // // Returns: @@ -217,7 +225,7 @@ func (s *BioSequence) Len() int { // This function does not have any parameters. // It returns a boolean value indicating whether the BioSequence has qualities. func (s *BioSequence) HasQualities() bool { - return len(s.qualities) > 0 + return s.qualities != nil && len(s.qualities) > 0 } // Qualities returns the sequence quality scores of the BioSequence. @@ -235,6 +243,19 @@ func (s *BioSequence) Qualities() Quality { return __make_default_qualities__(len(s.sequence)) } +// QualitiesString returns the string representation of the qualities of the BioSequence. +// +// Returns a string representing the qualities of the BioSequence after applying the shift. +func (s *BioSequence) QualitiesString() string { + quality_shift := obioptions.OutputQualityShift() + qual := s.Qualities() + qual_ascii := make([]byte, len(qual)) + for i := 0; i < len(qual); i++ { + qual_ascii[i] = byte(qual[i] + byte(quality_shift)) + } + return string(qual_ascii) +} + // Features returns the feature string of the BioSequence. // // The feature string contains the EMBL/GenBank not parsed feature table diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 1ae2ee7..898018a 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -3,6 +3,7 @@ package obiseq import ( "sync" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" log "github.com/sirupsen/logrus" "golang.org/x/exp/slices" ) @@ -171,3 +172,13 @@ func (s BioSequenceSlice) Size() int { return size } + +func (s BioSequenceSlice) AttributeKeys(skip_map bool) obiutils.Set[string] { + keys := obiutils.MakeSet[string]() + + for _, k := range s { + keys = keys.Union(k.AttributeKeys(skip_map)) + } + + return keys +} diff --git a/pkg/obiseq/language.go b/pkg/obiseq/language.go index 3c76c70..f718043 100644 --- a/pkg/obiseq/language.go +++ b/pkg/obiseq/language.go @@ -198,6 +198,15 @@ var OBILang = gval.NewLanguage( composition := (args[0].(*BioSequence)).Composition() return float64(composition['g']-composition['c']) / float64(composition['g']+composition['c']), nil }), + gval.Function("gc", func(args ...interface{}) (interface{}, error) { + composition := (args[0].(*BioSequence)).Composition() + return float64(composition['g']+composition['c']) / float64(args[0].(*BioSequence).Len()), nil + }), gval.Function("composition", func(args ...interface{}) (interface{}, error) { - return (args[0].(*BioSequence)).Composition(), nil + comp := (args[0].(*BioSequence)).Composition() + scomp := make(map[string]float64) + for k, v := range comp { + scomp[string(k)] = float64(v) + } + return scomp, nil })) diff --git a/pkg/obiseq/revcomp.go b/pkg/obiseq/revcomp.go index e23bd50..9ff0ca0 100644 --- a/pkg/obiseq/revcomp.go +++ b/pkg/obiseq/revcomp.go @@ -1,7 +1,17 @@ package obiseq // ".ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]" -var _revcmpDNA = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][") +var _revcmpDNA = []byte(".TVGHNNCDNNMNKNNNNYSAABWNRN]N[NNN") + +func complement(n byte) byte { + switch { + case n == '.' || n == '-': + return n + case (n >= 'A' && n <= 'z'): + return _revcmpDNA[n&31] | (n & 0x20) + } + return 'n' +} // Reverse complements a DNA sequence. // If the inplace parametter is true, that operation is done in place. @@ -18,8 +28,7 @@ func (sequence *BioSequence) ReverseComplement(inplace bool) *BioSequence { // ASCII code & 31 -> builds an index in witch (a|A) is 1 // ASCII code & 0x20 -> Foce lower case - s[j], s[i] = _revcmpDNA[s[i]&31]|(s[i]&0x20), - _revcmpDNA[s[j]&31]|(s[j]&0x20) + s[j], s[i] = complement(s[i]), complement(s[j]) j++ } @@ -40,8 +49,7 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence { b := []byte(m) // Echange and reverse complement symboles - b[1], b[9] = _revcmpDNA[b[9]&31]|(b[9]&0x20), - _revcmpDNA[b[1]&31]|(b[1]&0x20) + b[1], b[9] = complement(b[9]), complement(b[1]) // Exchange sequencing scores b[3], b[4], b[11], b[12] = b[11], b[12], b[3], b[4] @@ -65,7 +73,6 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence { return sequence } - func ReverseComplementWorker(inplace bool) SeqWorker { f := func(input *BioSequence) *BioSequence { return input.ReverseComplement(inplace) diff --git a/pkg/obitools/obiconvert/options.go b/pkg/obitools/obiconvert/options.go index 13463be..91e5065 100644 --- a/pkg/obitools/obiconvert/options.go +++ b/pkg/obitools/obiconvert/options.go @@ -16,13 +16,10 @@ var __input_ecopcr_format__ = false var __input_embl_format__ = false var __input_genbank_format__ = false -var __input_solexa_quality__ = false - var __output_in_fasta__ = false var __output_in_fastq__ = false var __output_fastjson_format__ = false var __output_fastobi_format__ = false -var __output_solexa_quality__ = false var __no_progress_bar__ = false var __compressed__ = false @@ -54,9 +51,6 @@ func InputOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__input_genbank_format__, "genbank", __input_genbank_format__, options.Description("Read data following the Genbank flatfile format.")) - options.BoolVar(&__input_solexa_quality__, "solexa", __input_solexa_quality__, - options.Description("Decodes quality string according to the Solexa specification.")) - options.BoolVar(&__no_ordered_input__, "no-order", __no_ordered_input__, options.Description("When several input files are provided, "+ "indicates that there is no order among them.")) @@ -71,7 +65,7 @@ func OutputModeOptionSet(options *getoptions.GetOpt) { options.Alias("Z"), options.Description("Output is compressed")) - options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__, + options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__, options.Description("Sequences of length equal to zero are suppressed from the output")) options.StringVar(&__output_file_name__, "out", __output_file_name__, @@ -146,7 +140,7 @@ func CLICompressed() bool { } func CLISkipEmpty() bool { - return __skip_empty__ + return __skip_empty__ } func CLIInputFastHeaderFormat() string { @@ -181,22 +175,6 @@ func CLIAnalyzeOnly() int { return __read_only_entries__ } -func CLIInputQualityShift() int { - if __input_solexa_quality__ { - return 64 - } else { - return 33 - } -} - -func CLIOutputQualityShift() int { - if __output_solexa_quality__ { - return 64 - } else { - return 33 - } -} - func CLIProgressBar() bool { return !__no_progress_bar__ } @@ -217,4 +195,4 @@ func SetFullFileBatch() { } func FullFileBatch() bool { return __full_file_batch__ -} \ No newline at end of file +} diff --git a/pkg/obitools/obiconvert/sequence_reader.go b/pkg/obitools/obiconvert/sequence_reader.go index a37d482..fafc9ba 100644 --- a/pkg/obitools/obiconvert/sequence_reader.go +++ b/pkg/obitools/obiconvert/sequence_reader.go @@ -98,10 +98,8 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) { opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) - opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift())) opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch())) - if len(filenames) == 0 { log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat()) opts = append(opts, obiformats.OptionsSource("stdin")) diff --git a/pkg/obitools/obiconvert/sequence_writer.go b/pkg/obitools/obiconvert/sequence_writer.go index 0756656..1e1e7ef 100644 --- a/pkg/obitools/obiconvert/sequence_writer.go +++ b/pkg/obitools/obiconvert/sequence_writer.go @@ -59,8 +59,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence, opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) - opts = append(opts, obiformats.OptionsQualityShift(CLIOutputQualityShift())) - opts = append(opts, obiformats.OptionsCompressed(CLICompressed())) var err error diff --git a/pkg/obitools/obicsv/obicsv.go b/pkg/obitools/obicsv/obicsv.go index 3cb7aa5..ff85013 100644 --- a/pkg/obitools/obicsv/obicsv.go +++ b/pkg/obitools/obicsv/obicsv.go @@ -27,8 +27,6 @@ func CLIWriteCSV(iterator obiiter.IBioSequence, opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) - - opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift())) opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed())) opts = append(opts, obiformats.CSVId(CLIPrintId()), @@ -37,6 +35,7 @@ func CLIWriteCSV(iterator obiiter.IBioSequence, obiformats.CSVDefinition(CLIPrintDefinition()), obiformats.CSVKeys(CLIToBeKeptAttributes()), obiformats.CSVSequence(CLIPrintSequence()), + obiformats.CSVAutoColumn(CLIAutoColumns()), ) var err error diff --git a/pkg/obitools/obidistribute/distribute.go b/pkg/obitools/obidistribute/distribute.go index beb5a9c..aa95472 100644 --- a/pkg/obitools/obidistribute/distribute.go +++ b/pkg/obitools/obidistribute/distribute.go @@ -32,7 +32,6 @@ func DistributeSequence(sequences obiiter.IBioSequence) { opts = append(opts, obiformats.OptionsParallelWorkers(nworkers), obiformats.OptionsBatchSize(obioptions.CLIBatchSize()), - obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()), obiformats.OptionsAppendFile(CLIAppendSequences()), obiformats.OptionsCompressed(obiconvert.CLICompressed())) diff --git a/pkg/obitools/obitag/options.go b/pkg/obitools/obitag/options.go index 349d1aa..5e3c05b 100644 --- a/pkg/obitools/obitag/options.go +++ b/pkg/obitools/obitag/options.go @@ -93,8 +93,6 @@ func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) { opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) - opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift())) - opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed())) var err error