From aa42df326a7436caaf44c9c499204d1e211e4a4b Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 4 Jun 2024 11:57:16 +0200 Subject: [PATCH] Correct a bug in the fastq reader affecting the quality of the last record of each chunk Former-commit-id: b842d60af9c2f1f971946d99999d13cfc15793b3 --- pkg/obiformats/fastaseq_read.go | 19 ++++--- pkg/obiformats/fastqseq_read.go | 73 +++++++++++++++++++-------- pkg/obiformats/seqfile_chunck_read.go | 5 -- pkg/obioptions/version.go | 5 +- pkg/obiseq/biosequence.go | 16 +++--- pkg/obiseq/pool.go | 28 +++++++++- 6 files changed, 104 insertions(+), 42 deletions(-) diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 46b6235..e737101 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -6,7 +6,6 @@ import ( "io" "os" "path" - "slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -155,7 +154,11 @@ func _ParseFastaFile(source string, if C == '>' { if previous == '\r' || previous == '\n' { // End of sequence - s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) + rawseq := seqBytes.Bytes() + if len(rawseq) == 0 { + log.Fatalf("@%s[%s] : sequence is empty", identifier, source) + } + s := obiseq.NewBioSequence(identifier, rawseq, definition) s.SetSource(source) sequences = append(sequences, s) if no_order { @@ -198,17 +201,21 @@ func _ParseFastaFile(source string, } if state == 6 { - s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) + rawseq := seqBytes.Bytes() + if len(rawseq) == 0 { + log.Fatalf("@%s[%s] : sequence is empty", identifier, source) + } + s := obiseq.NewBioSequence(identifier, rawseq, definition) s.SetSource(source) sequences = append(sequences, s) } if len(sequences) > 0 { + co := chunks.order if no_order { - out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) - } else { - out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences)) + co = chunck_order() } + out.Push(obiiter.MakeBioSequenceBatch(co, sequences)) } } diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index 67ed182..becef96 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -6,7 +6,6 @@ import ( "io" "os" "path" - "slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" @@ -97,9 +96,27 @@ func _EndOfLastFastqEntry(buffer []byte) int { if i == 0 || state != 7 { return -1 } + return cut } +func _storeSequenceQuality(bytes *bytes.Buffer, out *obiseq.BioSequence, quality_shift byte) { + q := bytes.Bytes() + if len(q) == 0 { + log.Fatalf("@%s[%s] : sequence quality is empty", out.Id(), out.Source()) + } + + if len(q) != out.Len() { + log.Fatalf("%s[%s] : sequence data and quality lenght not equal (%d <> %d)", + out.Id(), out.Source(), len(q), out.Len()) + } + + for i := 0; i < len(q); i++ { + q[i] = q[i] - quality_shift + } + out.SetQualities(q) +} + func _ParseFastqFile(source string, input ChannelSeqFileChunk, out obiiter.IBioSequence, @@ -122,6 +139,8 @@ func _ParseFastqFile(source string, for chunks := range input { scanner := bufio.NewReader(chunks.raw) sequences := make(obiseq.BioSequenceSlice, 0, 100) + previous := byte(0) + for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() { is_end_of_line := C == '\r' || C == '\n' @@ -135,12 +154,12 @@ func _ParseFastqFile(source string, // Beginning of sequence state = 1 } else { - log.Errorf("%s : sequence entry is not starting with @", source) + log.Fatalf("%s : sequence entry is not starting with @", source) } case 1: // Beginning of identifier (Mandatory) if is_sep { // No identifier -> ERROR - log.Errorf("%s : sequence identifier is empty", source) + log.Fatalf("%s : sequence identifier is empty", source) } else { // Beginning of identifier state = 2 @@ -191,7 +210,11 @@ func _ParseFastqFile(source string, case 6: if is_end_of_line { // End of sequence - s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) + rawseq := seqBytes.Bytes() + if len(rawseq) == 0 { + log.Fatalf("@%s[%s] : sequence is empty", identifier, source) + } + s := obiseq.NewBioSequence(identifier, rawseq, definition) s.SetSource(source) sequences = append(sequences, s) state = 7 @@ -199,7 +222,16 @@ func _ParseFastqFile(source string, if C >= 'A' && C <= 'Z' { C = C + 'a' - 'A' } - seqBytes.WriteByte(C) + if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' { + seqBytes.WriteByte(C) + } else { + context, _ := scanner.Peek(30) + context = append( + append([]byte{previous}, C), + context...) + log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)", + source, identifier, C, string(context)) + } } case 7: if is_end_of_line { @@ -207,9 +239,10 @@ func _ParseFastqFile(source string, } else if C == '+' { state = 8 } else { - log.Errorf("@%s[%s] : sequence data not followed by a line starting with + but a %c", identifier, source, C) + log.Fatalf("@%s[%s] : sequence data not followed by a line starting with + but a %c", identifier, source, C) } case 8: + // State consuming the + internal header line if is_end_of_line { state = 9 } @@ -224,16 +257,7 @@ func _ParseFastqFile(source string, } case 10: if is_end_of_line { - // End of quality - q := qualBytes.Bytes() - if len(q) != sequences[len(sequences)-1].Len() { - log.Errorf("%s[%s] : sequence data and quality lenght not equal (%d/%d)", - identifier, source, len(q), sequences[len(sequences)-1].Len()) - } - for i := 0; i < len(q); i++ { - q[i] = q[i] - quality_shift - } - sequences[len(sequences)-1].SetQualities(q) + _storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift) if no_order { if len(sequences) == batch_size { @@ -252,18 +276,25 @@ func _ParseFastqFile(source string, } else if C == '@' { state = 1 } else { - log.Errorf("%s[%s] : sequence record not followed by a line starting with @", identifier, source) + log.Fatalf("%s[%s] : sequence record not followed by a line starting with @", identifier, source) } } + + previous = C } if len(sequences) > 0 { - if no_order { - out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) - } else { - out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences)) + if state == 10 { + _storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift) + state = 1 } + + co := chunks.order + if no_order { + co = chunck_order() + } + out.Push(obiiter.MakeBioSequenceBatch(co, sequences)) } } diff --git a/pkg/obiformats/seqfile_chunck_read.go b/pkg/obiformats/seqfile_chunck_read.go index eb98424..07e6ef4 100644 --- a/pkg/obiformats/seqfile_chunck_read.go +++ b/pkg/obiformats/seqfile_chunck_read.go @@ -91,11 +91,6 @@ func ReadSeqFileChunk(reader io.Reader, io := bytes.NewBuffer(slices.Clone(buff)) chunk_channel <- SeqFileChunk{io, i} i++ - - // if string(buff[io.Len()-2:]) != "//" { - // log.Fatalf("File chunck ends with 3 bytes : %s", io.Bytes()[io.Len()-3:]) - // } - } if lremain > 0 { diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index a006513..8095363 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -4,7 +4,10 @@ import ( "fmt" ) -var _Commit = "" +// TODO: The version number is extracted from git. This induces that the version +// corresponds to the last commit, and not the one when the file will be +// commited +var _Commit = "f4fcc19" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 88b1a07..2cab788 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -273,19 +273,19 @@ func (s *BioSequence) Qualities() Quality { // Returns a string representing the qualities of the BioSequence after applying the shift. func (s *BioSequence) QualitiesString() string { quality_shift := obioptions.OutputQualityShift() + qual := s.Qualities() - qual_ascii := make([]byte, len(qual)) + qual_ascii := GetSlice(len(qual))[0:len(qual)] for i := 0; i < len(qual); i++ { quality := qual[i] - if quality < 0 { - quality = 0 - } if quality > 93 { quality = 93 } qual_ascii[i] = quality + quality_shift } - return string(qual_ascii) + qual_sting := string(qual_ascii) + RecycleSlice(&qual_ascii) + return qual_sting } // Features returns the feature string of the BioSequence. @@ -420,7 +420,8 @@ func (s *BioSequence) SetSequence(sequence []byte) { if s.sequence != nil { RecycleSlice(&s.sequence) } - s.sequence = obiutils.InPlaceToLower(sequence) + s.sequence = GetSlice(len(sequence))[0:len(sequence)] + copy(s.sequence, obiutils.InPlaceToLower(sequence)) } // Setting the qualities of the BioSequence. @@ -428,7 +429,8 @@ func (s *BioSequence) SetQualities(qualities Quality) { if s.qualities != nil { RecycleSlice(&s.qualities) } - s.qualities = qualities + s.qualities = GetSlice(len(qualities))[0:len(qualities)] + copy(s.qualities, qualities) } // A method that appends a byte slice to the qualities of the BioSequence. diff --git a/pkg/obiseq/pool.go b/pkg/obiseq/pool.go index c31c6f3..aa6cb1c 100644 --- a/pkg/obiseq/pool.go +++ b/pkg/obiseq/pool.go @@ -15,6 +15,17 @@ var _BioSequenceByteSlicePool = sync.Pool{ }, } +// RecycleSlice recycles a byte slice by clearing its contents and returning it +// to a pool if it is small enough. +// +// Parameters: - s: a pointer to a byte slice that will be recycled. +// +// This function first checks if the input slice is not nil and has a non-zero +// capacity. If so, it clears the contents of the slice by setting its length to +// 0. Then, it checks if the capacity of the slice is less than or equal to +// 1024. If it is, the function puts the slice into a pool for reuse. If the +// capacity is 0 or greater than 1024, the function does nothing. If the input +// slice is nil or has a zero capacity, the function logs a panic message. func RecycleSlice(s *[]byte) { if s != nil && cap(*s) > 0 { *s = (*s)[:0] @@ -27,9 +38,22 @@ func RecycleSlice(s *[]byte) { } } -// It returns a slice of bytes from a pool of slices. +// GetSlice returns a byte slice with the specified capacity. // -// the slice can be prefilled with the provided values +// The function first checks if the capacity is less than or equal to 1024. If it is, +// it retrieves a byte slice from the _BioSequenceByteSlicePool. If the retrieved +// slice is nil, has a nil underlying array, or has a capacity less than the +// specified capacity, a new byte slice is created with the specified capacity. +// If the capacity is greater than 1024, a new byte slice is created with the +// specified capacity. +// +// The function returns the byte slice. +// +// Parameters: +// - capacity: the desired capacity of the byte slice. +// +// Return type: +// - []byte: the byte slice with the specified capacity. func GetSlice(capacity int) []byte { p := (*[]byte)(nil) if capacity <= 1024 {