Correct a bug in the fastq reader affecting the quality of the last record of each chunk

Former-commit-id: b842d60af9c2f1f971946d99999d13cfc15793b3
This commit is contained in:
Eric Coissac
2024-06-04 11:57:16 +02:00
parent 411124d1b3
commit aa42df326a
6 changed files with 104 additions and 42 deletions

View File

@@ -6,7 +6,6 @@ import (
"io"
"os"
"path"
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
@@ -155,7 +154,11 @@ func _ParseFastaFile(source string,
if C == '>' {
if previous == '\r' || previous == '\n' {
// End of sequence
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
rawseq := seqBytes.Bytes()
if len(rawseq) == 0 {
log.Fatalf("@%s[%s] : sequence is empty", identifier, source)
}
s := obiseq.NewBioSequence(identifier, rawseq, definition)
s.SetSource(source)
sequences = append(sequences, s)
if no_order {
@@ -198,17 +201,21 @@ func _ParseFastaFile(source string,
}
if state == 6 {
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
rawseq := seqBytes.Bytes()
if len(rawseq) == 0 {
log.Fatalf("@%s[%s] : sequence is empty", identifier, source)
}
s := obiseq.NewBioSequence(identifier, rawseq, definition)
s.SetSource(source)
sequences = append(sequences, s)
}
if len(sequences) > 0 {
co := chunks.order
if no_order {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
} else {
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
co = chunck_order()
}
out.Push(obiiter.MakeBioSequenceBatch(co, sequences))
}
}

View File

@@ -6,7 +6,6 @@ import (
"io"
"os"
"path"
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
@@ -97,9 +96,27 @@ func _EndOfLastFastqEntry(buffer []byte) int {
if i == 0 || state != 7 {
return -1
}
return cut
}
// _storeSequenceQuality decodes the quality string accumulated in bytes and
// attaches it to the sequence out.
//
// Every byte of the buffer content is decreased by quality_shift, in place,
// and the resulting slice is handed to out.SetQualities.
//
// A fatal error is reported through log.Fatalf when the buffer is empty or
// when its length differs from out.Len().
//
// The buffer itself is neither reset nor copied by this function.
// NOTE(review): the slice given to SetQualities aliases the buffer's storage;
// presumably SetQualities keeps its own copy — confirm before the caller
// reuses the buffer.
func _storeSequenceQuality(bytes *bytes.Buffer, out *obiseq.BioSequence, quality_shift byte) {
	q := bytes.Bytes()

	if len(q) == 0 {
		log.Fatalf("@%s[%s] : sequence quality is empty", out.Id(), out.Source())
	}

	if len(q) != out.Len() {
		log.Fatalf("%s[%s] : sequence data and quality length not equal (%d <> %d)",
			out.Id(), out.Source(), len(q), out.Len())
	}

	// Convert the encoded characters to scores in place.
	// NOTE(review): byte arithmetic wraps around, so a character lower than
	// quality_shift yields a large value instead of an error — confirm that
	// the input is validated elsewhere.
	for i := range q {
		q[i] -= quality_shift
	}

	out.SetQualities(q)
}
func _ParseFastqFile(source string,
input ChannelSeqFileChunk,
out obiiter.IBioSequence,
@@ -122,6 +139,8 @@ func _ParseFastqFile(source string,
for chunks := range input {
scanner := bufio.NewReader(chunks.raw)
sequences := make(obiseq.BioSequenceSlice, 0, 100)
previous := byte(0)
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
is_end_of_line := C == '\r' || C == '\n'
@@ -135,12 +154,12 @@ func _ParseFastqFile(source string,
// Beginning of sequence
state = 1
} else {
log.Errorf("%s : sequence entry is not starting with @", source)
log.Fatalf("%s : sequence entry is not starting with @", source)
}
case 1: // Beginning of identifier (Mandatory)
if is_sep {
// No identifier -> ERROR
log.Errorf("%s : sequence identifier is empty", source)
log.Fatalf("%s : sequence identifier is empty", source)
} else {
// Beginning of identifier
state = 2
@@ -191,7 +210,11 @@ func _ParseFastqFile(source string,
case 6:
if is_end_of_line {
// End of sequence
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
rawseq := seqBytes.Bytes()
if len(rawseq) == 0 {
log.Fatalf("@%s[%s] : sequence is empty", identifier, source)
}
s := obiseq.NewBioSequence(identifier, rawseq, definition)
s.SetSource(source)
sequences = append(sequences, s)
state = 7
@@ -199,7 +222,16 @@ func _ParseFastqFile(source string,
if C >= 'A' && C <= 'Z' {
C = C + 'a' - 'A'
}
seqBytes.WriteByte(C)
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
seqBytes.WriteByte(C)
} else {
context, _ := scanner.Peek(30)
context = append(
append([]byte{previous}, C),
context...)
log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)",
source, identifier, C, string(context))
}
}
case 7:
if is_end_of_line {
@@ -207,9 +239,10 @@ func _ParseFastqFile(source string,
} else if C == '+' {
state = 8
} else {
log.Errorf("@%s[%s] : sequence data not followed by a line starting with + but a %c", identifier, source, C)
log.Fatalf("@%s[%s] : sequence data not followed by a line starting with + but a %c", identifier, source, C)
}
case 8:
// State consuming the + internal header line
if is_end_of_line {
state = 9
}
@@ -224,16 +257,7 @@ func _ParseFastqFile(source string,
}
case 10:
if is_end_of_line {
// End of quality
q := qualBytes.Bytes()
if len(q) != sequences[len(sequences)-1].Len() {
log.Errorf("%s[%s] : sequence data and quality lenght not equal (%d/%d)",
identifier, source, len(q), sequences[len(sequences)-1].Len())
}
for i := 0; i < len(q); i++ {
q[i] = q[i] - quality_shift
}
sequences[len(sequences)-1].SetQualities(q)
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
if no_order {
if len(sequences) == batch_size {
@@ -252,18 +276,25 @@ func _ParseFastqFile(source string,
} else if C == '@' {
state = 1
} else {
log.Errorf("%s[%s] : sequence record not followed by a line starting with @", identifier, source)
log.Fatalf("%s[%s] : sequence record not followed by a line starting with @", identifier, source)
}
}
previous = C
}
if len(sequences) > 0 {
if no_order {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
} else {
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
if state == 10 {
_storeSequenceQuality(qualBytes, sequences[len(sequences)-1], quality_shift)
state = 1
}
co := chunks.order
if no_order {
co = chunck_order()
}
out.Push(obiiter.MakeBioSequenceBatch(co, sequences))
}
}

View File

@@ -91,11 +91,6 @@ func ReadSeqFileChunk(reader io.Reader,
io := bytes.NewBuffer(slices.Clone(buff))
chunk_channel <- SeqFileChunk{io, i}
i++
// if string(buff[io.Len()-2:]) != "//" {
// log.Fatalf("File chunck ends with 3 bytes : %s", io.Bytes()[io.Len()-3:])
// }
}
if lremain > 0 {