mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Correct a bug in the fastq reader affecting the quality of the last record of each chunk
Former-commit-id: b842d60af9c2f1f971946d99999d13cfc15793b3
This commit is contained in:
@ -6,7 +6,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"slices"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
@ -155,7 +154,11 @@ func _ParseFastaFile(source string,
|
|||||||
if C == '>' {
|
if C == '>' {
|
||||||
if previous == '\r' || previous == '\n' {
|
if previous == '\r' || previous == '\n' {
|
||||||
// End of sequence
|
// End of sequence
|
||||||
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
rawseq := seqBytes.Bytes()
|
||||||
|
if len(rawseq) == 0 {
|
||||||
|
log.Fatalf("@%s[%s] : sequence is empty", identifier, source)
|
||||||
|
}
|
||||||
|
s := obiseq.NewBioSequence(identifier, rawseq, definition)
|
||||||
s.SetSource(source)
|
s.SetSource(source)
|
||||||
sequences = append(sequences, s)
|
sequences = append(sequences, s)
|
||||||
if no_order {
|
if no_order {
|
||||||
@ -198,17 +201,21 @@ func _ParseFastaFile(source string,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if state == 6 {
|
if state == 6 {
|
||||||
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
rawseq := seqBytes.Bytes()
|
||||||
|
if len(rawseq) == 0 {
|
||||||
|
log.Fatalf("@%s[%s] : sequence is empty", identifier, source)
|
||||||
|
}
|
||||||
|
s := obiseq.NewBioSequence(identifier, rawseq, definition)
|
||||||
s.SetSource(source)
|
s.SetSource(source)
|
||||||
sequences = append(sequences, s)
|
sequences = append(sequences, s)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(sequences) > 0 {
|
if len(sequences) > 0 {
|
||||||
|
co := chunks.order
|
||||||
if no_order {
|
if no_order {
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
co = chunck_order()
|
||||||
} else {
|
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
|
||||||
}
|
}
|
||||||
|
out.Push(obiiter.MakeBioSequenceBatch(co, sequences))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,7 +6,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"slices"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
@ -97,9 +96,27 @@ func _EndOfLastFastqEntry(buffer []byte) int {
|
|||||||
if i == 0 || state != 7 {
|
if i == 0 || state != 7 {
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
return cut
|
return cut
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func _storeSequenceQuality(bytes *bytes.Buffer, out *obiseq.BioSequence, quality_shift byte) {
|
||||||
|
q := bytes.Bytes()
|
||||||
|
if len(q) == 0 {
|
||||||
|
log.Fatalf("@%s[%s] : sequence quality is empty", out.Id(), out.Source())
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(q) != out.Len() {
|
||||||
|
log.Fatalf("%s[%s] : sequence data and quality lenght not equal (%d <> %d)",
|
||||||
|
out.Id(), out.Source(), len(q), out.Len())
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(q); i++ {
|
||||||
|
q[i] = q[i] - quality_shift
|
||||||
|
}
|
||||||
|
out.SetQualities(q)
|
||||||
|
}
|
||||||
|
|
||||||
func _ParseFastqFile(source string,
|
func _ParseFastqFile(source string,
|
||||||
input ChannelSeqFileChunk,
|
input ChannelSeqFileChunk,
|
||||||
out obiiter.IBioSequence,
|
out obiiter.IBioSequence,
|
||||||
@ -122,6 +139,8 @@ func _ParseFastqFile(source string,
|
|||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
scanner := bufio.NewReader(chunks.raw)
|
scanner := bufio.NewReader(chunks.raw)
|
||||||
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
||||||
|
previous := byte(0)
|
||||||
|
|
||||||
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
|
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
|
||||||
|
|
||||||
is_end_of_line := C == '\r' || C == '\n'
|
is_end_of_line := C == '\r' || C == '\n'
|
||||||
@ -135,12 +154,12 @@ func _ParseFastqFile(source string,
|
|||||||
// Beginning of sequence
|
// Beginning of sequence
|
||||||
state = 1
|
state = 1
|
||||||
} else {
|
} else {
|
||||||
log.Errorf("%s : sequence entry is not starting with @", source)
|
log.Fatalf("%s : sequence entry is not starting with @", source)
|
||||||
}
|
}
|
||||||
case 1: // Beginning of identifier (Mandatory)
|
case 1: // Beginning of identifier (Mandatory)
|
||||||
if is_sep {
|
if is_sep {
|
||||||
// No identifier -> ERROR
|
// No identifier -> ERROR
|
||||||
log.Errorf("%s : sequence identifier is empty", source)
|
log.Fatalf("%s : sequence identifier is empty", source)
|
||||||
} else {
|
} else {
|
||||||
// Beginning of identifier
|
// Beginning of identifier
|
||||||
state = 2
|
state = 2
|
||||||
@ -191,7 +210,11 @@ func _ParseFastqFile(source string,
|
|||||||
case 6:
|
case 6:
|
||||||
if is_end_of_line {
|
if is_end_of_line {
|
||||||
// End of sequence
|
// End of sequence
|
||||||
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
rawseq := seqBytes.Bytes()
|
||||||
|
if len(rawseq) == 0 {
|
||||||
|
log.Fatalf("@%s[%s] : sequence is empty", identifier, source)
|
||||||
|
}
|
||||||
|
s := obiseq.NewBioSequence(identifier, rawseq, definition)
|
||||||
s.SetSource(source)
|
s.SetSource(source)
|
||||||
sequences = append(sequences, s)
|
sequences = append(sequences, s)
|
||||||
state = 7
|
state = 7
|
||||||
@ -199,7 +222,16 @@ func _ParseFastqFile(source string,
|
|||||||
if C >= 'A' && C <= 'Z' {
|
if C >= 'A' && C <= 'Z' {
|
||||||
C = C + 'a' - 'A'
|
C = C + 'a' - 'A'
|
||||||
}
|
}
|
||||||
seqBytes.WriteByte(C)
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||||
|
seqBytes.WriteByte(C)
|
||||||
|
} else {
|
||||||
|
context, _ := scanner.Peek(30)
|
||||||
|
context = append(
|
||||||
|
append([]byte{previous}, C),
|
||||||
|
context...)
|
||||||
|
log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)",
|
||||||
|
source, identifier, C, string(context))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
case 7:
|
case 7:
|
||||||
if is_end_of_line {
|
if is_end_of_line {
|
||||||
@ -207,9 +239,10 @@ func _ParseFastqFile(source string,
|
|||||||
} else if C == '+' {
|
} else if C == '+' {
|
||||||
state = 8
|
state = 8
|
||||||
} else {
|
} else {
|
||||||
log.Errorf("@%s[%s] : sequence data not followed by a line starting with + but a %c", identifier, source, C)
|
log.Fatalf("@%s[%s] : sequence data not followed by a line starting with + but a %c", identifier, source, C)
|
||||||
}
|
}
|
||||||
case 8:
|
case 8:
|
||||||
|
// State consuming the + internal header line
|
||||||
if is_end_of_line {
|
if is_end_of_line {
|
||||||
state = 9
|
state = 9
|
||||||
}
|
}
|
||||||
@ -224,16 +257,7 @@ func _ParseFastqFile(source string,
|
|||||||
}
|
}
|
||||||
case 10:
|
case 10:
|
||||||
if is_end_of_line {
|
if is_end_of_line {
|
||||||
// End of quality
|
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||||
q := qualBytes.Bytes()
|
|
||||||
if len(q) != sequences[len(sequences)-1].Len() {
|
|
||||||
log.Errorf("%s[%s] : sequence data and quality lenght not equal (%d/%d)",
|
|
||||||
identifier, source, len(q), sequences[len(sequences)-1].Len())
|
|
||||||
}
|
|
||||||
for i := 0; i < len(q); i++ {
|
|
||||||
q[i] = q[i] - quality_shift
|
|
||||||
}
|
|
||||||
sequences[len(sequences)-1].SetQualities(q)
|
|
||||||
|
|
||||||
if no_order {
|
if no_order {
|
||||||
if len(sequences) == batch_size {
|
if len(sequences) == batch_size {
|
||||||
@ -252,18 +276,25 @@ func _ParseFastqFile(source string,
|
|||||||
} else if C == '@' {
|
} else if C == '@' {
|
||||||
state = 1
|
state = 1
|
||||||
} else {
|
} else {
|
||||||
log.Errorf("%s[%s] : sequence record not followed by a line starting with @", identifier, source)
|
log.Fatalf("%s[%s] : sequence record not followed by a line starting with @", identifier, source)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
previous = C
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(sequences) > 0 {
|
if len(sequences) > 0 {
|
||||||
if no_order {
|
if state == 10 {
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
|
||||||
} else {
|
state = 1
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
co := chunks.order
|
||||||
|
if no_order {
|
||||||
|
co = chunck_order()
|
||||||
|
}
|
||||||
|
out.Push(obiiter.MakeBioSequenceBatch(co, sequences))
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -91,11 +91,6 @@ func ReadSeqFileChunk(reader io.Reader,
|
|||||||
io := bytes.NewBuffer(slices.Clone(buff))
|
io := bytes.NewBuffer(slices.Clone(buff))
|
||||||
chunk_channel <- SeqFileChunk{io, i}
|
chunk_channel <- SeqFileChunk{io, i}
|
||||||
i++
|
i++
|
||||||
|
|
||||||
// if string(buff[io.Len()-2:]) != "//" {
|
|
||||||
// log.Fatalf("File chunck ends with 3 bytes : %s", io.Bytes()[io.Len()-3:])
|
|
||||||
// }
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if lremain > 0 {
|
if lremain > 0 {
|
||||||
|
@ -4,7 +4,10 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
)
|
)
|
||||||
|
|
||||||
var _Commit = ""
|
// TODO: The version number is extracted from git. This means that the version
|
||||||
|
// corresponds to the last commit, and not to the one in which this file will be
|
||||||
|
// committed
|
||||||
|
var _Commit = "f4fcc19"
|
||||||
var _Version = "Release 4.2.0"
|
var _Version = "Release 4.2.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
@ -273,19 +273,19 @@ func (s *BioSequence) Qualities() Quality {
|
|||||||
// Returns a string representing the qualities of the BioSequence after applying the shift.
|
// Returns a string representing the qualities of the BioSequence after applying the shift.
|
||||||
func (s *BioSequence) QualitiesString() string {
|
func (s *BioSequence) QualitiesString() string {
|
||||||
quality_shift := obioptions.OutputQualityShift()
|
quality_shift := obioptions.OutputQualityShift()
|
||||||
|
|
||||||
qual := s.Qualities()
|
qual := s.Qualities()
|
||||||
qual_ascii := make([]byte, len(qual))
|
qual_ascii := GetSlice(len(qual))[0:len(qual)]
|
||||||
for i := 0; i < len(qual); i++ {
|
for i := 0; i < len(qual); i++ {
|
||||||
quality := qual[i]
|
quality := qual[i]
|
||||||
if quality < 0 {
|
|
||||||
quality = 0
|
|
||||||
}
|
|
||||||
if quality > 93 {
|
if quality > 93 {
|
||||||
quality = 93
|
quality = 93
|
||||||
}
|
}
|
||||||
qual_ascii[i] = quality + quality_shift
|
qual_ascii[i] = quality + quality_shift
|
||||||
}
|
}
|
||||||
return string(qual_ascii)
|
qual_sting := string(qual_ascii)
|
||||||
|
RecycleSlice(&qual_ascii)
|
||||||
|
return qual_sting
|
||||||
}
|
}
|
||||||
|
|
||||||
// Features returns the feature string of the BioSequence.
|
// Features returns the feature string of the BioSequence.
|
||||||
@ -420,7 +420,8 @@ func (s *BioSequence) SetSequence(sequence []byte) {
|
|||||||
if s.sequence != nil {
|
if s.sequence != nil {
|
||||||
RecycleSlice(&s.sequence)
|
RecycleSlice(&s.sequence)
|
||||||
}
|
}
|
||||||
s.sequence = obiutils.InPlaceToLower(sequence)
|
s.sequence = GetSlice(len(sequence))[0:len(sequence)]
|
||||||
|
copy(s.sequence, obiutils.InPlaceToLower(sequence))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setting the qualities of the BioSequence.
|
// Setting the qualities of the BioSequence.
|
||||||
@ -428,7 +429,8 @@ func (s *BioSequence) SetQualities(qualities Quality) {
|
|||||||
if s.qualities != nil {
|
if s.qualities != nil {
|
||||||
RecycleSlice(&s.qualities)
|
RecycleSlice(&s.qualities)
|
||||||
}
|
}
|
||||||
s.qualities = qualities
|
s.qualities = GetSlice(len(qualities))[0:len(qualities)]
|
||||||
|
copy(s.qualities, qualities)
|
||||||
}
|
}
|
||||||
|
|
||||||
// A method that appends a byte slice to the qualities of the BioSequence.
|
// A method that appends a byte slice to the qualities of the BioSequence.
|
||||||
|
@ -15,6 +15,17 @@ var _BioSequenceByteSlicePool = sync.Pool{
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RecycleSlice recycles a byte slice by clearing its contents and returning it
|
||||||
|
// to a pool if it is small enough.
|
||||||
|
//
|
||||||
|
// Parameters: - s: a pointer to a byte slice that will be recycled.
|
||||||
|
//
|
||||||
|
// This function first checks if the input slice is not nil and has a non-zero
|
||||||
|
// capacity. If so, it clears the contents of the slice by setting its length to
|
||||||
|
// 0. Then, it checks if the capacity of the slice is less than or equal to
|
||||||
|
// 1024. If it is, the function puts the slice into a pool for reuse. If the
|
||||||
|
// capacity is 0 or greater than 1024, the function does nothing. If the input
|
||||||
|
// slice is nil or has a zero capacity, the function logs a panic message.
|
||||||
func RecycleSlice(s *[]byte) {
|
func RecycleSlice(s *[]byte) {
|
||||||
if s != nil && cap(*s) > 0 {
|
if s != nil && cap(*s) > 0 {
|
||||||
*s = (*s)[:0]
|
*s = (*s)[:0]
|
||||||
@ -27,9 +38,22 @@ func RecycleSlice(s *[]byte) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// It returns a slice of bytes from a pool of slices.
|
// GetSlice returns a byte slice with the specified capacity.
|
||||||
//
|
//
|
||||||
// the slice can be prefilled with the provided values
|
// The function first checks if the capacity is less than or equal to 1024. If it is,
|
||||||
|
// it retrieves a byte slice from the _BioSequenceByteSlicePool. If the retrieved
|
||||||
|
// slice is nil, has a nil underlying array, or has a capacity less than the
|
||||||
|
// specified capacity, a new byte slice is created with the specified capacity.
|
||||||
|
// If the capacity is greater than 1024, a new byte slice is created with the
|
||||||
|
// specified capacity.
|
||||||
|
//
|
||||||
|
// The function returns the byte slice.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - capacity: the desired capacity of the byte slice.
|
||||||
|
//
|
||||||
|
// Return type:
|
||||||
|
// - []byte: the byte slice with the specified capacity.
|
||||||
func GetSlice(capacity int) []byte {
|
func GetSlice(capacity int) []byte {
|
||||||
p := (*[]byte)(nil)
|
p := (*[]byte)(nil)
|
||||||
if capacity <= 1024 {
|
if capacity <= 1024 {
|
||||||
|
Reference in New Issue
Block a user