diff --git a/go.mod b/go.mod index 0647567..fd77a09 100644 --- a/go.mod +++ b/go.mod @@ -20,15 +20,28 @@ require ( scientificgo.org/special v0.0.0 ) +require ( + github.com/kr/pretty v0.2.1 // indirect + github.com/kr/text v0.1.0 // indirect +) + require ( github.com/davecgh/go-spew v1.1.1 // indirect + github.com/dsnet/compress v0.0.1 + github.com/gabriel-vasile/mimetype v1.4.2 // indirect github.com/klauspost/compress v1.16.7 // indirect github.com/mattn/go-runewidth v0.0.15 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rivo/uniseg v0.4.4 // indirect github.com/shopspring/decimal v1.3.1 // indirect + github.com/ulikunitz/xz v0.5.11 + github.com/yuin/goldmark v1.4.13 // indirect + golang.org/x/mod v0.12.0 // indirect + golang.org/x/net v0.14.0 // indirect golang.org/x/sys v0.11.0 // indirect golang.org/x/term v0.11.0 // indirect + golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect ) diff --git a/go.sum b/go.sum index 4ff6bda..41309d0 100644 --- a/go.sum +++ b/go.sum @@ -13,13 +13,25 @@ github.com/daichi-m/go18ds v1.12.1/go.mod h1:wc2dURUr8aMxxC4Mn5ObJGVM7uIKU8JagY4 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q= +github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo= +github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= +github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= +github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= +github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A= github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= +github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU= github.com/klauspost/pgzip v1.2.6/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= @@ 
-45,8 +57,17 @@ github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5Cc github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c= github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY= +github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= +github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8= +github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= +github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= +golang.org/x/mod v0.12.0 h1:rmsUpXtvNzj340zd98LZ4KntptpfRHwpFOHG188oHXc= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.14.0 h1:BONx9s002vGdD9umnlX1Po8vOZmrgH34qlHcD1MfK14= +golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -55,10 +76,14 @@ golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.11.0 h1:F9tnn/DA/Im8nCwm+fX+1/eBwi4qFjRT++MhtVC4ZX0= golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU= +golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846 h1:Vve/L0v7CXXuxUmaMGIEK/dEeq7uiqb5qBgQrZzIE7E= +golang.org/x/tools v0.12.1-0.20230815132531-74c255bcf846/go.mod h1:Sc0INKfu04TlqNoRA1hgpFZbhYXHPr4V5DzpSBTPqQM= gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0= gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc= diff --git a/pkg/obiformats/ecopcr_read.go b/pkg/obiformats/ecopcr_read.go index 1dae062..2b1ef0f 100644 --- a/pkg/obiformats/ecopcr_read.go +++ b/pkg/obiformats/ecopcr_read.go @@ -3,13 +3,14 @@ package obiformats import ( "encoding/csv" "fmt" - gzip "github.com/klauspost/pgzip" "io" "os" "path" "strconv" "strings" + gzip "github.com/klauspost/pgzip" + log "github.com/sirupsen/logrus" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" @@ -209,7 +210,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) 
obiiter.IBioSequence { }() if opt.pointer.full_file_batch { - newIter = newIter.FullFileIterator() + newIter = newIter.CompleteFileIterator() } return newIter diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index 73db113..3b9f7c2 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -266,7 +266,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence { go _ReadFlatFileChunk(reader, entry_channel) if opt.pointer.full_file_batch { - newIter = newIter.FullFileIterator() + newIter = newIter.CompleteFileIterator() } return newIter diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go new file mode 100644 index 0000000..eb6e9cf --- /dev/null +++ b/pkg/obiformats/fastaseq_read.go @@ -0,0 +1,322 @@ +package obiformats + +import ( + "bytes" + "fmt" + "io" + "os" + "path" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" + "golang.org/x/exp/slices" + + log "github.com/sirupsen/logrus" +) + +// lastSequenceCut extracts the up to the last sequence cut from a given buffer. +// +// It takes a parameter: +// - buffer []byte: the buffer to extract the sequence cut from. +// +// It returns two values: +// - []byte: the extracted sequences. +// - []byte: the remaining buffer after the sequence cut (the last sequence). +func lastSequenceCut(buffer []byte) ([]byte, []byte) { + imax := len(buffer) + last := 0 + state := 0 + for i := imax - 1; i >= 0 && state < 2; i-- { + if state == 0 && buffer[i] == '>' { + state = 1 + last = i + } else if state == 1 && (buffer[i] == '\r' || buffer[i] == '\n') { + state = 2 + } else { + state = 0 + } + } + + if state == 2 { + return buffer[:last], bytes.Clone(buffer[last:]) + } + return []byte{}, buffer +} + +// firstSequenceCut cuts the input buffer at the first occurrence of a ">" character +// following a sequence of "\r" or "\n" characters. +// +// It takes a byte slice as input, representing the buffer to be cut. +// It returns two byte slices: the first slice contains the part of the buffer before the cut, +// and the second slice contains the part of the buffer after the cut. +func firstSequenceCut(buffer []byte) ([]byte, []byte) { + imax := len(buffer) + last := 0 + state := 0 + for i := 0; i < imax && state < 2; i++ { + if (state == 0 || state == 1) && (buffer[i] == '\r' || buffer[i] == '\n') { + state = 1 + } else if (state == 1 || i == 0) && buffer[i] == '>' { + state = 2 + last = i + } else { + state = 0 + } + } + + if state == 2 { + return bytes.Clone(buffer[:last]), buffer[last:] + } + return buffer, []byte{} + +} + +func fullSequenceCut(buffer []byte) ([]byte, []byte, []byte) { + before, buffer := firstSequenceCut(buffer) + + if len(buffer) == 0 { + return before, []byte{}, []byte{} + } + + buffer, after := lastSequenceCut(buffer) + return before, buffer, after +} + +func Concatenate[S ~[]E, E any](s1, s2 S) S { + if len(s1) > 0 { + if len(s2) > 0 { + return append(s1[:len(s1):len(s1)], s2...) 
+ } + return s1 + } + return s2 +} + +type FastxChunk struct { + Bytes []byte + index int +} + +func FastaChunkReader(r io.Reader, size int, cutHead bool) (chan FastxChunk, error) { + out := make(chan FastxChunk) + buff := make([]byte, size) + + n, err := r.Read(buff) + if n > 0 && err == nil { + if n < size { + buff = buff[:n] + } + + begin, buff := firstSequenceCut(buff) + + if len(begin) > 0 && !cutHead { + return out, fmt.Errorf("begin is not empty : %s", string(begin)) + } + + go func(buff []byte) { + idx := 0 + end := []byte{} + + for err == nil && n > 0 { + // fmt.Println("============end=========================") + // fmt.Println(string(end)) + // fmt.Println("------------buff------------------------") + // fmt.Println(string(buff)) + buff = Concatenate(end, buff) + // fmt.Println("------------buff--pasted----------------") + // fmt.Println(string(buff)) + buff, end = lastSequenceCut(buff) + // fmt.Println("----------------buff--cutted------------") + // fmt.Println(string(buff)) + // fmt.Println("------------------end-------------------") + // fmt.Println(string(end)) + // fmt.Println("========================================") + if len(buff) > 0 { + out <- FastxChunk{ + Bytes: bytes.Clone(buff), + index: idx, + } + idx++ + } + + buff = slices.Grow(buff[:0], size)[0:size] + n, err = r.Read(buff) + if n < size { + buff = buff[:n] + } + // fmt.Printf("n = %d, err = %v\n", n, err) + } + + if len(end) > 0 { + out <- FastxChunk{ + Bytes: bytes.Clone(end), + index: idx, + } + } + + close(out) + }(buff) + } + + return out, nil +} + +func ParseFastaChunk(source string, ch FastxChunk) *obiiter.BioSequenceBatch { + slice := make(obiseq.BioSequenceSlice, 0, obioptions.CLIBatchSize()) + + state := 0 + start := 0 + current := 0 + var identifier string + var definition string + + for i := 0; i < len(ch.Bytes); i++ { + C := ch.Bytes[i] + is_end_of_line := C == '\r' || C == '\n' + is_space := C == ' ' || C == '\t' + is_sep := is_space || is_end_of_line + + switch state { + case 0: + if C == '>' { + // Beginning of sequence + state = 1 + } + case 1: + if is_sep { + // No identifier -> ERROR + return nil + } else { + // Beginning of identifier + state = 2 + start = i + } + case 2: + if is_sep { + // End of identifier + identifier = string(ch.Bytes[start:i]) + state = 3 + } + case 3: + if is_end_of_line { + // Definition empty + definition = "" + state = 5 + } else if !is_space { + // Beginning of definition + start = i + state = 4 + } + case 4: + if is_end_of_line { + definition = string(ch.Bytes[start:i]) + state = 5 + + } + case 5: + if !is_end_of_line { + // Beginning of sequence + start = i + current = i + 1 + state = 6 + } + case 6: + if C == '>' { + // End of sequence + s := obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition) + s.SetSource(source) + slice = append(slice, s) + state = 1 + + } else if !is_sep { + if C >= 'A' && C <= 'Z' { + C = C + 'a' - 'A' + } + // Removing white space from the sequence + if (C >= 'a' && C <= 'z') || C == '-' || C == '.' 
{ + ch.Bytes[current] = C + current++ + } + } + } + } + + slice = append(slice, obiseq.NewBioSequence(identifier, bytes.Clone(ch.Bytes[start:current]), definition)) + batch := obiiter.MakeBioSequenceBatch(ch.index, slice) + return &batch +} + +func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { + opt := MakeOptions(options) + out := obiiter.MakeIBioSequence() + + source := opt.Source() + + nworker := obioptions.CLIReadParallelWorkers() + out.Add(nworker) + + chkchan, err := FastaChunkReader(reader, 1024*500, false) + + if err != nil { + return obiiter.NilIBioSequence, err + } + + go func() { + out.WaitAndClose() + }() + + parser := func() { + defer out.Done() + for chk := range chkchan { + seqs := ParseFastaChunk(source, chk) + if seqs != nil { + out.Push(*seqs) + } + } + } + + for i := 0; i < nworker; i++ { + go parser() + } + + newIter := out.SortBatches().Rebatch(opt.BatchSize()) + + log.Debugln("Full file batch mode : ", opt.FullFileBatch()) + if opt.FullFileBatch() { + newIter = newIter.CompleteFileIterator() + } + + annotParser := opt.ParseFastSeqHeader() + + if annotParser != nil { + return IParseFastSeqHeaderBatch(newIter, options...), nil + } + + return newIter, nil +} + +func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { + options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) + + file, err := Ropen(filename) + + if err != nil { + return obiiter.NilIBioSequence, err + } + + return ReadFasta(file, options...) +} + +func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { + options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) + input, err := Buf(os.Stdin) + + if err != nil { + log.Fatalf("open file error: %v", err) + return obiiter.NilIBioSequence, err + } + + return ReadFasta(input, options...) 
+} diff --git a/pkg/obiformats/fastseq_read.go b/pkg/obiformats/fastseq_read.go index e18a3b5..f1e4364 100644 --- a/pkg/obiformats/fastseq_read.go +++ b/pkg/obiformats/fastseq_read.go @@ -124,7 +124,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe log.Debugln("Full file batch mode : ", opt.FullFileBatch()) if opt.FullFileBatch() { - newIter = newIter.FullFileIterator() + newIter = newIter.CompleteFileIterator() } parser := opt.ParseFastSeqHeader() @@ -155,7 +155,7 @@ func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence { log.Debugln("Full file batch mode : ", opt.FullFileBatch()) if opt.FullFileBatch() { - newIter = newIter.FullFileIterator() + newIter = newIter.CompleteFileIterator() } parser := opt.ParseFastSeqHeader() diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index dabaa5a..40b3b1c 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -34,14 +34,14 @@ var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp") func _ParseGenbankFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence, chunck_order func() int) { - var err error + var err error state := inHeader for chunks := range input { // log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len()) scanner := bufio.NewScanner(chunks.raw) sequences := make(obiseq.BioSequenceSlice, 0, 100) - sumlength:=0 + sumlength := 0 id := "" lseq := -1 scientificName := "" @@ -61,12 +61,12 @@ func _ParseGenbankFile(source string, case strings.HasPrefix(line, "LOCUS "): state = inEntry id = strings.SplitN(line[12:], " ", 2)[0] - match_length := _seqlenght_rx.FindStringSubmatch(line) + match_length := _seqlenght_rx.FindStringSubmatch(line) if len(match_length) > 0 { - lseq,err = strconv.Atoi(match_length[1]) + lseq, err = strconv.Atoi(match_length[1]) if err != nil { lseq = -1 - } + } } if lseq > 0 { seqBytes = bytes.NewBuffer(obiseq.GetSlice(lseq + 20)) @@ -101,7 +101,7 @@ func _ParseGenbankFile(source string, // sequence.Len(), seqBytes.Len()) sequences = append(sequences, sequence) - sumlength+=sequence.Len() + sumlength += sequence.Len() if len(sequences) == 100 || sumlength > 1e7 { out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) @@ -137,7 +137,7 @@ func _ParseGenbankFile(source string, if len(sequences) > 0 { out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) } -} + } out.Done() @@ -159,13 +159,13 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence { // for j := 0; j < opt.ParallelWorkers(); j++ { for j := 0; j < nworkers; j++ { - go _ParseGenbankFile(opt.Source(), entry_channel, newIter,chunck_order) + go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order) } go _ReadFlatFileChunk(reader, entry_channel) if opt.pointer.full_file_batch { - newIter = newIter.FullFileIterator() + newIter = newIter.CompleteFileIterator() } return newIter diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go index 6b1d72e..8c08687 100644 --- a/pkg/obiformats/options.go +++ b/pkg/obiformats/options.go @@ -1,6 +1,7 @@ package obiformats import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) @@ -43,8 +44,8 @@ func MakeOptions(setters []WithOption) Options { with_progress_bar: false, buffer_size: 2, quality_shift: 33, - parallel_workers: 4, - batch_size: 5000, + parallel_workers: obioptions.CLIReadParallelWorkers(), + batch_size: obioptions.CLIBatchSize(), full_file_batch: 
false, closefile: false, appendfile: false, diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index f63b3fb..84ca323 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -4,12 +4,10 @@ import ( "bufio" "bytes" "io" - "os" "path" "regexp" "github.com/gabriel-vasile/mimetype" - gzip "github.com/klauspost/pgzip" log "github.com/sirupsen/logrus" @@ -91,6 +89,36 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { return mimeType, newReader, nil } +// func ReadSequences(reader io.Reader, +// options ...WithOption) (obiiter.IBioSequence, error) { + +// mime, reader, err := OBIMimeTypeGuesser(reader) + +// if err != nil { +// return obiiter.NilIBioSequence, err +// } + +// reader = bufio.NewReader(reader) + +// switch mime.String() { +// case "text/fasta", "text/fastq": +// file.Close() +// is, err := ReadFastSeqFromFile(filename, options...) +// return is, err +// case "text/ecopcr2": +// return ReadEcoPCR(reader, options...), nil +// case "text/embl": +// return ReadEMBL(reader, options...), nil +// case "text/genbank": +// return ReadGenbank(reader, options...), nil +// default: +// log.Fatalf("File %s has guessed format %s which is not yet implemented", +// filename, mime.String()) +// } + +// return obiiter.NilIBioSequence, nil +// } + // ReadSequencesFromFile reads sequences from a file and returns an iterator of bio sequences and an error. // // Parameters: @@ -102,32 +130,20 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { // - error: An error if any occurred during the reading process. func ReadSequencesFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { - var file *os.File + var file *Reader var reader io.Reader - var greader io.Reader var err error options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename))))) - file, err = os.Open(filename) + file, err = Ropen(filename) if err != nil { log.Fatalf("open file error: %v", err) return obiiter.NilIBioSequence, err } - reader = file - - // Test if the flux is compressed by gzip - greader, err = gzip.NewReader(reader) - if err != nil { - file.Seek(0, 0) - } else { - log.Debugf("File %s is gz compressed ", filename) - reader = greader - } - - mime, reader, err := OBIMimeTypeGuesser(reader) + mime, reader, err := OBIMimeTypeGuesser(file) if err != nil { return obiiter.NilIBioSequence, err @@ -136,10 +152,12 @@ func ReadSequencesFromFile(filename string, reader = bufio.NewReader(reader) switch mime.String() { - case "text/fasta", "text/fastq": + case "text/fastq": file.Close() is, err := ReadFastSeqFromFile(filename, options...) return is, err + case "text/fasta": + return ReadFasta(reader, options...) case "text/ecopcr2": return ReadEcoPCR(reader, options...), nil case "text/embl": @@ -153,3 +171,9 @@ func ReadSequencesFromFile(filename string, return obiiter.NilIBioSequence, nil } + +// func ReadSequencesFromStdin(options ...WithOption) obiiter.IBioSequence { + +// options = append(options, OptionsSource("stdin")) + +// } diff --git a/pkg/obiformats/xopen.go b/pkg/obiformats/xopen.go new file mode 100644 index 0000000..a74140f --- /dev/null +++ b/pkg/obiformats/xopen.go @@ -0,0 +1,437 @@ +// This is an integration of the xopen package originally written by Brent Pedersen +// (https://github.com/brentp/xopen). 
+//
+// Here it can be considered as a fork of the version by [Wei Shen](http://shenwei.me):
+//
+// https://github.com/shenwei356/xopen
+//
+// Package xopen makes it easy to get buffered readers and writers.
+// Ropen opens a (possibly gzipped) file/process/http site for buffered reading.
+// Wopen opens a (possibly gzipped) file for buffered writing.
+// Both will use gzip when appropriate and will use buffered IO.
+package obiformats
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"os/exec"
+	"os/user"
+	"path/filepath"
+	"strings"
+
+	"github.com/dsnet/compress/bzip2"
+	"github.com/klauspost/compress/zstd"
+	gzip "github.com/klauspost/pgzip"
+	"github.com/ulikunitz/xz"
+)
+
+// Level is the default compression level of gzip.
+// This value will be automatically adjusted to the default value of zstd or bzip2.
+var Level = gzip.DefaultCompression
+
+// ErrNoContent means nothing in the stream/file.
+var ErrNoContent = errors.New("xopen: no content")
+
+// ErrDirNotSupported means the path is a directory.
+var ErrDirNotSupported = errors.New("xopen: input is a directory")
+
+// IsGzip returns true if the buffered Reader has the gzip magic.
+func IsGzip(b *bufio.Reader) (bool, error) {
+	return CheckBytes(b, []byte{0x1f, 0x8b})
+}
+
+// IsXz returns true if the buffered Reader has the xz magic.
+func IsXz(b *bufio.Reader) (bool, error) {
+	return CheckBytes(b, []byte{0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00})
+}
+
+// IsZst returns true if the buffered Reader has the zstd magic.
+func IsZst(b *bufio.Reader) (bool, error) {
+	return CheckBytes(b, []byte{0x28, 0xB5, 0x2f, 0xfd})
+}
+
+// IsBzip2 returns true if the buffered Reader has the bzip2 magic.
+func IsBzip2(b *bufio.Reader) (bool, error) {
+	return CheckBytes(b, []byte{0x42, 0x5a, 0x68})
+}
+
+// IsStdin checks if we are getting data from stdin.
+func IsStdin() bool {
+	// http://stackoverflow.com/a/26567513
+	stat, err := os.Stdin.Stat()
+	if err != nil {
+		return false
+	}
+	return (stat.Mode() & os.ModeCharDevice) == 0
+}
+
+// ExpandUser expands ~/path and ~otheruser/path appropriately
+func ExpandUser(path string) (string, error) {
+	if len(path) == 0 || path[0] != '~' {
+		return path, nil
+	}
+	var u *user.User
+	var err error
+	if len(path) == 1 || path[1] == '/' {
+		u, err = user.Current()
+	} else {
+		name := strings.Split(path[1:], "/")[0]
+		u, err = user.Lookup(name)
+	}
+	if err != nil {
+		return "", err
+	}
+	home := u.HomeDir
+	path = home + "/" + path[1:]
+	return path, nil
+}
+
+// Exists checks if a local file exists.
+func Exists(path string) bool {
+	path, perr := ExpandUser(path)
+	if perr != nil {
+		return false
+	}
+	_, err := os.Stat(path)
+	return err == nil
+}
+
+// CheckBytes peeks at a buffered stream and checks if the first read bytes match.
+func CheckBytes(b *bufio.Reader, buf []byte) (bool, error) {
+
+	m, err := b.Peek(len(buf))
+	if err != nil {
+		// return false, ErrNoContent
+		return false, err // EOF
+	}
+	for i := range buf {
+		if m[i] != buf[i] {
+			return false, nil
+		}
+	}
+	return true, nil
+}
+
+// Reader is returned by Ropen
+type Reader struct {
+	*bufio.Reader
+	rdr io.Reader
+	gz  io.ReadCloser
+}
+
+// Close the associated files.
+func (r *Reader) Close() error { + var err error + if r.gz != nil { + err = r.gz.Close() + if err != nil { + return err + } + } + if c, ok := r.rdr.(io.ReadCloser); ok { + err = c.Close() + if err != nil { + return err + } + } + return nil +} + +// Writer is returned by Wopen +type Writer struct { + *bufio.Writer + wtr *os.File + gz *gzip.Writer + xw *xz.Writer + zw *zstd.Encoder + bz2 *bzip2.Writer +} + +// Close the associated files. +func (w *Writer) Close() error { + var err error + err = w.Flush() + if err != nil { + return err + } + + if w.gz != nil { + err = w.gz.Close() + if err != nil { + return err + } + } + if w.xw != nil { + err = w.xw.Close() + if err != nil { + return err + } + } + if w.zw != nil { + err = w.zw.Close() + if err != nil { + return err + } + } + if w.bz2 != nil { + err = w.bz2.Close() + if err != nil { + return err + } + } + return w.wtr.Close() +} + +// Flush the writer. +func (w *Writer) Flush() error { + var err error + err = w.Writer.Flush() + if err != nil { + return err + } + + if w.gz != nil { + err = w.gz.Flush() + if err != nil { + return err + } + } + if w.zw != nil { + err = w.zw.Flush() + if err != nil { + return err + } + } + return nil +} + +var bufSize = 65536 + +// Buf returns a buffered reader from an io.Reader +// If f == "-", then it will attempt to read from os.Stdin. +// If the file is gzipped, it will be read as such. +func Buf(r io.Reader) (*Reader, error) { + b := bufio.NewReaderSize(r, bufSize) + var rd io.Reader + var rdr io.ReadCloser + + if is, err := IsGzip(b); err != nil { + // check BOM + t, _, err := b.ReadRune() // no content + if err != nil { + return nil, ErrNoContent + } + if t != '\uFEFF' { + b.UnreadRune() + } + return &Reader{b, r, rdr}, nil // non-gzip file with content less than 2 bytes + } else if is { + rdr, err = gzip.NewReader(b) + if err != nil { + return nil, err + } + b = bufio.NewReaderSize(rdr, bufSize) + } else if is, err := IsZst(b); err != nil { + // check BOM + t, _, err := b.ReadRune() // no content + if err != nil { + return nil, ErrNoContent + } + if t != '\uFEFF' { + b.UnreadRune() + } + return &Reader{b, r, rdr}, nil // non-gzip/zst file with content less than 4 bytes + } else if is { + rd, err = zstd.NewReader(b) + if err != nil { + return nil, err + } + b = bufio.NewReaderSize(rd, bufSize) + } else if is, err := IsXz(b); err != nil { + // check BOM + t, _, err := b.ReadRune() // no content + if err != nil { + return nil, ErrNoContent + } + if t != '\uFEFF' { + b.UnreadRune() + } + return &Reader{b, r, rdr}, nil // non-gzip/zst/xz file with content less than 6 bytes + } else if is { + rd, err = xz.NewReader(b) + if err != nil { + return nil, err + } + b = bufio.NewReaderSize(rd, bufSize) + } else if is, err := IsBzip2(b); err != nil { + // check BOM + t, _, err := b.ReadRune() // no content + if err != nil { + return nil, ErrNoContent + } + if t != '\uFEFF' { + b.UnreadRune() + } + return &Reader{b, r, rdr}, nil // non-gzip/zst/xz file with content less than 6 bytes + } else if is { + rd, err = bzip2.NewReader(b, &bzip2.ReaderConfig{}) + if err != nil { + return nil, err + } + b = bufio.NewReaderSize(rd, bufSize) + } + + // other files with content >= 6 bytes + + // check BOM + t, _, err := b.ReadRune() + if err != nil { + return nil, ErrNoContent + } + if t != '\uFEFF' { + b.UnreadRune() + } + return &Reader{b, r, rdr}, nil +} + +// XReader returns a reader from a url string or a file. 
+func XReader(f string) (io.Reader, error) {
+	if strings.HasPrefix(f, "http://") || strings.HasPrefix(f, "https://") {
+		var rsp *http.Response
+		rsp, err := http.Get(f)
+		if err != nil {
+			return nil, err
+		}
+		if rsp.StatusCode != 200 {
+			return nil, fmt.Errorf("http error downloading %s. status: %s", f, rsp.Status)
+		}
+		rdr := rsp.Body
+		return rdr, nil
+	}
+	f, err := ExpandUser(f)
+	if err != nil {
+		return nil, err
+	}
+
+	fi, err := os.Stat(f)
+	if err != nil {
+		return nil, err
+	}
+	if fi.IsDir() {
+		return nil, ErrDirNotSupported
+	}
+
+	return os.Open(f)
+}
+
+// Ropen opens a buffered reader.
+func Ropen(f string) (*Reader, error) {
+	var err error
+	var rdr io.Reader
+	if f == "-" {
+		if !IsStdin() {
+			return nil, errors.New("stdin not detected")
+		}
+		b, err := Buf(os.Stdin)
+		return b, err
+	} else if f[0] == '|' {
+		// TODO: use csv to handle quoted file names.
+		cmdStrs := strings.Split(f[1:], " ")
+		var cmd *exec.Cmd
+		if len(cmdStrs) == 2 {
+			cmd = exec.Command(cmdStrs[0], cmdStrs[1:]...)
+		} else {
+			cmd = exec.Command(cmdStrs[0])
+		}
+		rdr, err = cmd.StdoutPipe()
+		if err != nil {
+			return nil, err
+		}
+		err = cmd.Start()
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		rdr, err = XReader(f)
+	}
+	if err != nil {
+		return nil, err
+	}
+	b, err := Buf(rdr)
+	return b, err
+}
+
+// Wopen opens a buffered writer.
+// If f == "-", then stdout will be used.
+// If f ends with ".gz", then the output will be gzipped.
+// If f ends with ".xz", then the output will be xz-compressed.
+// If f ends with ".zst", then the output will be zstd-compressed.
+// If f ends with ".bz2", then the output will be bzip2-compressed.
+func Wopen(f string) (*Writer, error) {
+	return WopenFile(f, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0666)
+}
+
+// WopenFile opens a buffered writer.
+// If f == "-", then stdout will be used.
+// If f ends with ".gz", then the output will be gzipped.
+// If f ends with ".xz", then the output will be xz-compressed.
+// If f ends with ".bz2", then the output will be bzip2-compressed.
+func WopenFile(f string, flag int, perm os.FileMode) (*Writer, error) { + var wtr *os.File + if f == "-" { + wtr = os.Stdout + } else { + dir := filepath.Dir(f) + fi, err := os.Stat(dir) + if err == nil && !fi.IsDir() { + return nil, fmt.Errorf("can not write file into a non-directory path: %s", dir) + } + if os.IsNotExist(err) { + os.MkdirAll(dir, 0755) + } + wtr, err = os.OpenFile(f, flag, perm) + if err != nil { + return nil, err + } + } + + f2 := strings.ToLower(f) + if strings.HasSuffix(f2, ".gz") { + gz, err := gzip.NewWriterLevel(wtr, Level) + if err != nil { + err = errors.New(fmt.Sprintf("xopen: %s", err)) + } + return &Writer{bufio.NewWriterSize(gz, bufSize), wtr, gz, nil, nil, nil}, err + } + if strings.HasSuffix(f2, ".xz") { + xw, err := xz.NewWriter(wtr) + return &Writer{bufio.NewWriterSize(xw, bufSize), wtr, nil, xw, nil, nil}, err + } + if strings.HasSuffix(f2, ".zst") { + level := Level + if level == gzip.DefaultCompression { + level = 2 + } + zw, err := zstd.NewWriter(wtr, zstd.WithEncoderLevel(zstd.EncoderLevel(level))) + if err != nil { + err = errors.New(fmt.Sprintf("xopen: zstd: %s", err)) + } + return &Writer{bufio.NewWriterSize(zw, bufSize), wtr, nil, nil, zw, nil}, err + } + if strings.HasSuffix(f2, ".bz2") { + level := Level + if level == gzip.DefaultCompression { + level = 6 + } + bz2, err := bzip2.NewWriter(wtr, &bzip2.WriterConfig{Level: level}) + if err != nil { + err = errors.New(fmt.Sprintf("xopen: %s", err)) + } + return &Writer{bufio.NewWriterSize(bz2, bufSize), wtr, nil, nil, nil, bz2}, err + } + return &Writer{bufio.NewWriterSize(wtr, bufSize), wtr, nil, nil, nil, nil}, nil +} diff --git a/pkg/obiformats/xopen_test.go b/pkg/obiformats/xopen_test.go new file mode 100644 index 0000000..b40ec11 --- /dev/null +++ b/pkg/obiformats/xopen_test.go @@ -0,0 +1,148 @@ +package obiformats + +import ( + "bufio" + "bytes" + "compress/gzip" + "fmt" + "io" + "os" + "strings" + "testing" + + . 
"gopkg.in/check.v1" +) + +func Test(t *testing.T) { TestingT(t) } + +type XopenTest struct{} + +var _ = Suite(&XopenTest{}) + +func gzFromString(s string) string { + var c bytes.Buffer + gz := gzip.NewWriter(&c) + gz.Write([]byte(s)) + return c.String() +} + +var gzTests = []struct { + isGz bool + data string +}{ + {false, "asdf"}, + {true, gzFromString("asdf")}, +} + +func (s *XopenTest) TestIsGzip(c *C) { + for _, t := range gzTests { + isGz, err := IsGzip(bufio.NewReader(strings.NewReader(t.data))) + c.Assert(err, IsNil) + c.Assert(t.isGz, Equals, isGz) + } +} + +func (s *XopenTest) TestIsStdin(c *C) { + r := IsStdin() + c.Assert(r, Equals, false) +} + +func (s *XopenTest) TestRopen(c *C) { + rdr, err := Ropen("-") + c.Assert(err, ErrorMatches, "stdin not detected") + c.Assert(rdr, IsNil) +} + +func (s *XopenTest) TestWopen(c *C) { + for _, f := range []string{"t.gz", "t"} { + testString := "ASDF1234" + wtr, err := Wopen(f) + c.Assert(err, IsNil) + _, err = os.Stat(f) + c.Assert(err, IsNil) + c.Assert(wtr.wtr, NotNil) + fmt.Fprintf(wtr, testString) + wtr.Close() + + rdr, err := Ropen(f) + c.Assert(err, IsNil) + + str, err := rdr.ReadString(99) + c.Assert(str, Equals, testString) + c.Assert(err, Equals, io.EOF) + str, err = rdr.ReadString(99) + c.Assert(str, Equals, "") + + rdr.Close() + os.Remove(f) + } +} + +var httpTests = []struct { + url string + expectError bool +}{ + {"https://raw.githubusercontent.com/brentp/xopen/master/README.md", false}, + {"http://raw.githubusercontent.com/brentp/xopen/master/README.md", false}, + {"http://raw.githubusercontent.com/brentp/xopen/master/BAD.md", true}, +} + +func (s *XopenTest) TestReadHttp(c *C) { + for _, t := range httpTests { + rdr, err := Ropen(t.url) + if !t.expectError { + c.Assert(err, IsNil) + v, err := rdr.ReadString(byte('\n')) + c.Assert(err, IsNil) + c.Assert(len(v), Not(Equals), 0) + } else { + c.Assert(err, ErrorMatches, ".* 404 Not Found") + } + } +} + +func (s *XopenTest) TestReadProcess(c *C) { + for _, cmd := range []string{"|ls -lh", "|ls", "|ls -lh xopen_test.go"} { + rdr, err := Ropen(cmd) + c.Assert(err, IsNil) + b := make([]byte, 1000) + _, err = rdr.Read(b) + if err != io.EOF { + c.Assert(err, IsNil) + } + lines := strings.Split(string(b), "\n") + has := false + for _, line := range lines { + if strings.Contains(line, "xopen_test.go") { + has = true + } + } + c.Assert(has, Equals, true) + } +} + +func (s *XopenTest) TestOpenStdout(c *C) { + w, err := Wopen("-") + c.Assert(err, IsNil) + c.Assert(w.wtr, Equals, os.Stdout) +} + +func (s *XopenTest) TestOpenBadFile(c *C) { + r, err := Ropen("XXXXXXXXXXXXXXXXXXXXXXX") + c.Assert(r, IsNil) + c.Assert(err, ErrorMatches, ".*no such file.*") +} + +func (s *XopenTest) TestExists(c *C) { + c.Assert(Exists("xopen.go"), Equals, true) + c.Assert(Exists("____xx"), Equals, false) +} + +func (s *XopenTest) TestUser(c *C) { + c.Assert(Exists("~"), Equals, true) +} + +func (s *XopenTest) TestExpand(c *C) { + _, err := ExpandUser("~baduser66") + c.Assert(err, Not(IsNil)) +} diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index 927f46c..f6901ec 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -461,7 +461,7 @@ func (iterator IBioSequence) Recycle() { // iterator.Get() batch := iterator.Get() log.Debugln("Recycling batch #", batch.Order()) - recycled+=batch.Len() + recycled += batch.Len() batch.Recycle(true) } log.Debugf("End of the recycling of %d Bioseq objects", recycled) @@ -679,7 +679,7 @@ func (iterator IBioSequence) Load() 
obiseq.BioSequenceSlice { chunck := obiseq.MakeBioSequenceSlice() for iterator.Next() { b := iterator.Get() - log.Debugf("append %d sequences",b.Len()) + log.Debugf("append %d sequences", b.Len()) chunck = append(chunck, b.Slice()...) b.Recycle(false) } @@ -687,7 +687,14 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice { return chunck } -func (iterator IBioSequence) FullFileIterator() IBioSequence { +// CompleteFileIterator generates a new iterator for reading a complete file. +// +// This iterator reads all the remaining sequences in the file, and returns them as a +// single obiseq.BioSequenceSlice. +// +// The function takes no parameters. +// It returns an IBioSequence object. +func (iterator IBioSequence) CompleteFileIterator() IBioSequence { newIter := MakeIBioSequence() log.Debug("Stream is read in full file mode") @@ -700,7 +707,7 @@ func (iterator IBioSequence) FullFileIterator() IBioSequence { go func() { slice := iterator.Load() - log.Printf("A batch of %d sequence is read",len(slice)) + log.Printf("A batch of %d sequence is read", len(slice)) if len(slice) > 0 { newIter.Push(MakeBioSequenceBatch(0, slice)) } @@ -716,7 +723,6 @@ func (iterator IBioSequence) FullFileIterator() IBioSequence { // It takes a slice of BioSequence objects, and returns an iterator that will return batches of // BioSequence objects - func IBatchOver(data obiseq.BioSequenceSlice, size int, sizes ...int) IBioSequence { diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index e25f3d4..69b4f20 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -203,77 +203,135 @@ func (s *BioSequence) Len() int { return len(s.sequence) } -// Checking if the BioSequence has quality scores. +// HasQualities checks if the BioSequence has sequence qualitiy scores. +// +// This function does not have any parameters. +// It returns a boolean value indicating whether the BioSequence has qualities. func (s *BioSequence) HasQualities() bool { return len(s.qualities) > 0 } -// Returning the qualities of the sequence. +// Qualities returns the sequence quality scores of the BioSequence. +// +// It checks if the BioSequence has qualities. If it does, it returns the qualities +// stored in the BioSequence struct. Otherwise, it creates and returns default +// qualities based on the length of the sequence. +// +// Returns: +// - Quality: The quality of the BioSequence. func (s *BioSequence) Qualities() Quality { if s.HasQualities() { return s.qualities - } else { - return __make_default_qualities__(len(s.sequence)) } + return __make_default_qualities__(len(s.sequence)) } +// Features returns the feature string of the BioSequence. +// +// The feature string contains the EMBL/GenBank not parsed feature table +// +// as extracted from the flat file. +// +// No parameters. +// Returns a string. func (s *BioSequence) Features() string { return string(s.feature) } -// Checking if the BioSequence has annotations. +// HasAnnotation checks if the BioSequence has any annotations. +// +// It does not take any parameters. +// It returns a boolean value indicating whether the BioSequence has any annotations. func (s *BioSequence) HasAnnotation() bool { return len(s.annotations) > 0 } -// Returning the annotations of the BioSequence. +// Annotations returns the Annotation object associated with the BioSequence. +// +// This function does not take any parameters. +// It returns an Annotation object. 
func (s *BioSequence) Annotations() Annotation { - if s.annotations == nil { s.annotations = GetAnnotation() } - return s.annotations } +// AnnotationsLock locks the annotation of the BioSequence. +// +// This function acquires a lock on the annotation of the BioSequence, +// preventing concurrent access to it. func (s *BioSequence) AnnotationsLock() { s.annot_lock.Lock() } +// AnnotationsUnlock unlocks the annotations mutex in the BioSequence struct. +// +// No parameters. +// No return types. func (s *BioSequence) AnnotationsUnlock() { s.annot_lock.Unlock() } -// Checking if the BioSequence has a source. +// HasSource checks if the BioSequence has a source. +// +// The source is the filename without directory name and extension from where the sequence was read. +// +// No parameters. +// Returns a boolean value indicating whether the BioSequence has a source or not. func (s *BioSequence) HasSource() bool { return len(s.source) > 0 } +// Source returns the source of the BioSequence. +// +// The source is the filename without directory name and extension from where the sequence was read. +// +// This function does not take any parameters. +// It returns a string. func (s *BioSequence) Source() string { return s.source } -// Returning the MD5 hash of the sequence. +// MD5 calculates the MD5 hash of the BioSequence. +// +// No parameters. +// Returns [16]byte, the MD5 hash of the BioSequence. func (s *BioSequence) MD5() [16]byte { return md5.Sum(s.sequence) } -// Setting the id of the BioSequence. +// SetId sets the id of the BioSequence. +// +// Parameters: +// - id: the new id for the BioSequence. +// +// No return value. func (s *BioSequence) SetId(id string) { s.id = id } -// Setting the definition of the sequence. +// SetDefinition sets the definition of the BioSequence. +// +// It takes a string parameter 'definition' and assigns it to the 'definition' field of the BioSequence struct. func (s *BioSequence) SetDefinition(definition string) { s.definition = definition } -// Setting the source of the sequence. +// SetSource sets the source of the BioSequence. +// +// Parameter: +// - source: a string representing the filename without directory name and extension from where the sequence was read. func (s *BioSequence) SetSource(source string) { s.source = source } -// Setting the features of the BioSequence. +// SetFeatures sets the feature of the BioSequence. +// +// Parameters: +// - feature: a byte slice representing the feature to be set. +// +// No return value. func (s *BioSequence) SetFeatures(feature []byte) { if cap(s.feature) >= 300 { RecycleSlice(&s.feature) diff --git a/pkg/obiseq/biosequence_test.go b/pkg/obiseq/biosequence_test.go index 9311b7e..4089a18 100644 --- a/pkg/obiseq/biosequence_test.go +++ b/pkg/obiseq/biosequence_test.go @@ -328,6 +328,225 @@ func TestBioSequence_Len(t *testing.T) { } } +// TestHasQualities tests the HasQualities method of the BioSequence struct. +// +// It includes two test cases: +// +// 1. Test case 1: BioSequence with empty qualities slice +// - Creates a BioSequence instance with an empty qualities slice. +// - Expects false as the result of calling the HasQualities method on the BioSequence instance. +// +// 2. Test case 2: BioSequence with non-empty qualities slice +// - Creates a BioSequence instance with a non-empty qualities slice. +// - Expects true as the result of calling the HasQualities method on the BioSequence instance. +// +// No parameters are required. +// No return types are specified. 
+func TestHasQualities(t *testing.T) { + // Test case 1: BioSequence with empty qualities slice + seq1 := NewBioSequence("", []byte(""), "") + seq1.qualities = []byte{} + if seq1.HasQualities() != false { + t.Errorf("Test case 1 failed: expected false, got true") + } + + // Test case 2: BioSequence with non-empty qualities slice + seq2 := NewBioSequence("", []byte(""), "") + seq2.qualities = []byte{20, 30, 40} + if seq2.HasQualities() != true { + t.Errorf("Test case 2 failed: expected true, got false") + } +} + +// TestQualities tests the Qualities method of the BioSequence struct. +// +// It creates a BioSequence with a given sequence and qualities and sets them. +// Then it compares the returned qualities with the expected ones. +// If the qualities are not equal, it fails the test case. +// +// Test case 1: BioSequence has qualities +// - sequence: []byte("ATCG") +// - qualities: Quality{10, 20, 30, 40} +// - expected: Quality{10, 20, 30, 40} +// +// Test case 2: BioSequence does not have qualities +// - sequence: []byte("ATCG") +// - qualities: nil +// - expected: defaultQualities +// +// Parameters: +// - t: *testing.T - the testing struct for running test cases and reporting failures. +// +// Return type: +// None +func TestQualities(t *testing.T) { + // Test case: BioSequence has qualities + sequence := []byte("ATCG") + qualities := Quality{10, 20, 30, 40} + bioSeq := NewBioSequence("ABC123", sequence, "Test Sequence") + bioSeq.SetQualities(qualities) + + result := bioSeq.Qualities() + expected := qualities + + if !reflect.DeepEqual(result, expected) { + t.Errorf("Test case failed: BioSequence has qualities") + } + + // Test case: BioSequence does not have qualities + defaultQualities := __make_default_qualities__(len(sequence)) + bioSeq = NewBioSequence("ABC123", sequence, "Test Sequence") + bioSeq.SetQualities(nil) + + result = bioSeq.Qualities() + expected = defaultQualities + + if !reflect.DeepEqual(result, expected) { + t.Errorf("Test case failed: BioSequence does not have qualities") + } +} + +// TestBioSequence_Features tests the Features function of the BioSequence struct. +// +// It first tests the case when the feature string is empty. It creates a new BioSequence +// with an empty feature string and an empty byte slice. It expects an empty string as +// the result of calling the Features function on this BioSequence. If the result does +// not match the expected value, it prints an error message. +// +// It then tests the case when the feature string is non-empty. It creates a new BioSequence +// with an empty feature string and an empty byte slice. It sets the feature string to +// "test sequence" and expects "test sequence" as the result of calling the Features function +// on this BioSequence. If the result does not match the expected value, it prints an error message. +func TestBioSequence_Features(t *testing.T) { + // Testing empty feature string + seq := NewBioSequence("", []byte(""), "") + expected := "" + if got := seq.Features(); got != expected { + t.Errorf("Expected %q, but got %q", expected, got) + } + + // Testing non-empty feature string + seq = NewBioSequence("", []byte(""), "") + seq.feature = []byte("test sequence") + expected = "test sequence" + if got := seq.Features(); got != expected { + t.Errorf("Expected %q, but got %q", expected, got) + } +} + +// TestHasAnnotation is a unit test function that tests the HasAnnotation method of the BioSequence struct. 
+// +// This function tests the behavior of the HasAnnotation method in different scenarios: +// - Test case: BioSequence with no annotations. +// - Test case: BioSequence with one annotation. +// - Test case: BioSequence with multiple annotations. +// +// The function verifies that the HasAnnotation method returns the expected boolean value for each test case. +// It uses the *testing.T parameter to report any test failures. +// +// No parameters. +// No return values. +func TestHasAnnotation(t *testing.T) { + // Test case: BioSequence with no annotations + seq := BioSequence{} + expected := false + if got := seq.HasAnnotation(); got != expected { + t.Errorf("Expected %v, but got %v", expected, got) + } + + // Test case: BioSequence with one annotation + seq = BioSequence{annotations: map[string]interface{}{"annotation1": "value1"}} + expected = true + if got := seq.HasAnnotation(); got != expected { + t.Errorf("Expected %v, but got %v", expected, got) + } + + // Test case: BioSequence with multiple annotations + seq = BioSequence{ + annotations: map[string]interface{}{ + "annotation1": "value1", + "annotation2": "value2", + }, + } + expected = true + if got := seq.HasAnnotation(); got != expected { + t.Errorf("Expected %v, but got %v", expected, got) + } +} + +// TestBioSequenceAnnotations tests the Annotations method of the BioSequence struct. +// +// It verifies the behavior of the method when the `annotations` field of the BioSequence struct is nil and when it is not nil. +// The method should return the expected annotation values and fail the test if the returned annotations do not match the expected ones. +// The test cases cover both scenarios to ensure the correctness of the method. +func TestBioSequenceAnnotations(t *testing.T) { + s := &BioSequence{} + + // Test case 1: Annotations is nil + s.annotations = nil + expected := GetAnnotation() + actual := s.Annotations() + if !reflect.DeepEqual(expected, actual) { + t.Errorf("Test case 1 failed: Expected %v, but got %v", expected, actual) + } + + // Test case 2: Annotations is not nil + s.annotations = Annotation{} + expected = s.annotations + actual = s.Annotations() + if !reflect.DeepEqual(expected, actual) { + t.Errorf("Test case 2 failed: Expected %v, but got %v", expected, actual) + } +} + +func TestAnnotationsLock(t *testing.T) { + // Test case 1: Lock the annotation of an empty BioSequence + seq := NewEmptyBioSequence(0) + seq.AnnotationsLock() + + // Test case 2: Lock the annotation of a BioSequence with existing annotations + seq2 := NewEmptyBioSequence(0) + seq2.annotations = map[string]interface{}{ + "key1": "value1", + "key2": "value2", + } + seq2.AnnotationsLock() +} + +// TestBioSequence_MD5 tests the MD5 function of the BioSequence struct. +// +// It includes two test cases: one for an empty sequence and one for a non-empty sequence. +// Each test case creates a BioSequence instance with a specific sequence and compares the MD5 result with the expected value. +// If the result does not match the expected value, an error is reported using the t.Errorf function. +// The expected MD5 values are hardcoded in the test cases. +func TestBioSequence_MD5(t *testing.T) { + // Test case 1: Empty sequence + { + s := &BioSequence{sequence: []byte("")} + expected := [16]byte{ + 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, + 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e, + } + result := s.MD5() + if result != expected { + t.Errorf("Test case 1 failed. 
Expected: %v, got: %v", expected, result) + } + } + + // Test case 2: Non-empty sequence + { + s := &BioSequence{sequence: []byte("ACGT")} + expected := [16]byte{ + 0xf1, 0xf8, 0xf4, 0xbf, 0x41, 0x3b, 0x16, 0xad, + 0x13, 0x57, 0x22, 0xaa, 0x45, 0x91, 0x04, 0x3e, + } + result := s.MD5() + if result != expected { + t.Errorf("Test case 2 failed. Expected: %v, got: %v", expected, result) + } + } +} + // TestBioSequence_Composition tests the Composition method of the BioSequence struct. // // It tests the method with three different test cases: diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 3337512..1ae2ee7 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -1,8 +1,10 @@ package obiseq import ( - log "github.com/sirupsen/logrus" "sync" + + log "github.com/sirupsen/logrus" + "golang.org/x/exp/slices" ) // BioSequenceSlice represents a collection or a set of BioSequence. @@ -18,22 +20,39 @@ var _BioSequenceSlicePool = sync.Pool{ }, } -// > This function returns a pointer to a new `BioSequenceSlice` object +// NewBioSequenceSlice returns a new BioSequenceSlice with the specified size. +// +// The size parameter is optional. If provided, the returned slice will be +// resized accordingly. +// +// Returns a pointer to the newly created BioSequenceSlice. func NewBioSequenceSlice(size ...int) *BioSequenceSlice { slice := _BioSequenceSlicePool.Get().(*BioSequenceSlice) if len(size) > 0 { s := size[0] slice = slice.InsureCapacity(s) - (*slice)=(*slice)[0:s] + (*slice) = (*slice)[0:s] } return slice } -// `MakeBioSequenceSlice()` returns a pointer to a new `BioSequenceSlice` struct +// MakeBioSequenceSlice creates a new BioSequenceSlice with the specified size(s). +// +// Parameters: +// - size: The size(s) of the BioSequenceSlice to create (optional). +// +// Return: +// A new BioSequenceSlice with the specified size(s). func MakeBioSequenceSlice(size ...int) BioSequenceSlice { return *NewBioSequenceSlice(size...) } +// Recycle cleans up the BioSequenceSlice by recycling its elements and resetting its length. +// +// If including_seq is true, each element of the BioSequenceSlice is recycled using the Recycle method, +// and then set to nil. If including_seq is false, each element is simply set to nil. +// +// The function does not return anything. func (s *BioSequenceSlice) Recycle(including_seq bool) { if s == nil { log.Panicln("Trying too recycle a nil pointer") @@ -42,60 +61,113 @@ func (s *BioSequenceSlice) Recycle(including_seq bool) { // Code added to potentially limit memory leaks if including_seq { for i := range *s { - (*s)[i] .Recycle() + (*s)[i].Recycle() (*s)[i] = nil } - + } else { for i := range *s { (*s)[i] = nil - } + } } *s = (*s)[:0] _BioSequenceSlicePool.Put(s) } -// Making sure that the slice has enough capacity to hold the number of elements that are being added -// to it. +// InsureCapacity ensures that the BioSequenceSlice has a minimum capacity +// +// It takes an integer `capacity` as a parameter, which represents the desired minimum capacity of the BioSequenceSlice. +// It returns a pointer to the BioSequenceSlice. func (s *BioSequenceSlice) InsureCapacity(capacity int) *BioSequenceSlice { var c int if s != nil { - c = cap(*s) + c = cap(*s) } else { c = 0 } - if c < capacity { - sl := make(BioSequenceSlice, 0,capacity) - s = &sl - } + *s = slices.Grow[BioSequenceSlice](*s, capacity-c) return s } -// Appending the sequence to the slice. +// Push appends a BioSequence to the BioSequenceSlice. 
+// +// It takes a pointer to a BioSequenceSlice and a BioSequence as parameters. +// It does not return anything. func (s *BioSequenceSlice) Push(sequence *BioSequence) { *s = append(*s, sequence) } -// Returning the last element of the slice and removing it from the slice. +// Pop returns and removes the last element from the BioSequenceSlice. +// +// It does not take any parameters. +// It returns *BioSequence, the last element of the slice. func (s *BioSequenceSlice) Pop() *BioSequence { - _s := (*s)[len(*s)-1] - (*s)[len(*s)-1] = nil - *s = (*s)[:len(*s)-1] - return _s + // Get the length of the slice + length := len(*s) + + // If the slice is empty, return nil + if length == 0 { + return nil + } + + // Get the last element of the slice + lastElement := (*s)[length-1] + + // Set the last element to nil + (*s)[length-1] = nil + + // Remove the last element from the slice + *s = (*s)[:length-1] + + // Return the last element + return lastElement } -// Returning the first element of the slice and removing it from the slice. +// Pop0 returns and removes the first element of the BioSequenceSlice. +// +// It does not take any parameters. +// It returns a pointer to a BioSequence object. func (s *BioSequenceSlice) Pop0() *BioSequence { - _s := (*s)[0] + if len(*s) == 0 { + return nil + } + firstElement := (*s)[0] (*s)[0] = nil *s = (*s)[1:] - return _s + return firstElement } -// Test that a slice of sequences contains at least a sequence. +// NotEmpty checks if the BioSequenceSlice is not empty. +// +// No parameters. +// Returns a boolean value indicating if the BioSequenceSlice is not empty. func (s BioSequenceSlice) NotEmpty() bool { return len(s) > 0 } + +// Len returns the length of the BioSequenceSlice. +// +// It has no parameters. +// It returns an integer. +func (s BioSequenceSlice) Len() int { + return len(s) +} + +// Size returns the total size of the BioSequenceSlice. +// +// It calculates the size by iterating over each BioSequence in the slice +// and summing up their lengths. +// +// Returns an integer representing the total size of the BioSequenceSlice. +func (s BioSequenceSlice) Size() int { + size := 0 + + for _, s := range s { + size += s.Len() + } + + return size +}
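
A minimal usage sketch of the reading path added by this patch, assuming only the exported API visible in the diff above (the Ropen-backed ReadFastaFromFile and the IBioSequence iterator methods Next, Get, Len and Recycle); the input file name is hypothetical and option handling is left at its defaults:

    package main

    import (
    	"fmt"
    	"log"

    	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
    )

    func main() {
    	// ReadFastaFromFile now opens its input through Ropen, so plain,
    	// gzip-, zstd-, xz- and bzip2-compressed FASTA files are all read
    	// transparently.
    	iter, err := obiformats.ReadFastaFromFile("sequences.fasta.gz")
    	if err != nil {
    		log.Fatalf("cannot open input: %v", err)
    	}

    	count := 0
    	for iter.Next() {
    		batch := iter.Get() // one batch, as produced by ParseFastaChunk
    		count += batch.Len()
    		batch.Recycle(true) // hand the sequences back to the pool
    	}
    	fmt.Println("sequences read:", count)
    }

The same iterator can be collapsed into a single whole-file batch with CompleteFileIterator() (the renamed FullFileIterator), which the format readers above already apply when the full_file_batch option is set.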