From 3e1d9a41ecf13c18e15cfb5778145fcaebbcaf08 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 30 May 2024 18:12:06 +0200 Subject: [PATCH] Adds more format checking during fasta parsing Former-commit-id: fbc3d9c923936287a591f01f9401b710b584aa14 --- pkg/obiformats/fastaseq_read.go | 64 +++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 10038b5..46b6235 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -59,7 +59,17 @@ func _ParseFastaFile(source string, for chunks := range input { scanner := bufio.NewReader(chunks.raw) + start, _ := scanner.Peek(20) + if start[0] != '>' { + log.Fatalf("%s : first character is not '>'", string(start)) + } + if start[1] == ' ' { + log.Fatalf("%s :Strange", string(start)) + } sequences := make(obiseq.BioSequenceSlice, 0, batch_size) + + previous := byte(0) + for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() { is_end_of_line := C == '\r' || C == '\n' @@ -71,11 +81,18 @@ func _ParseFastaFile(source string, if C == '>' { // Beginning of sequence state = 1 + } else { + // ERROR + log.Fatalf("%s : sequence entry does not start with '>'", source) } case 1: if is_sep { // No identifier -> ERROR - log.Errorf("%s : sequence entry does not have an identifier", source) + + context, _ := scanner.Peek(30) + context = append([]byte{C}, context...) + log.Fatalf("%s [%s]: sequence entry does not have an identifier", + source, string(context)) } else { // Beginning of identifier idBytes.Reset() @@ -124,22 +141,39 @@ func _ParseFastaFile(source string, if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' { seqBytes.WriteByte(C) + } else { + context, _ := scanner.Peek(30) + context = append( + append([]byte{previous}, C), + context...) + log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)", + source, identifier, C, string(context)) } state = 6 } case 6: if C == '>' { - // End of sequence - s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) - s.SetSource(source) - sequences = append(sequences, s) - if no_order { - if len(sequences) == batch_size { - out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) - sequences = make(obiseq.BioSequenceSlice, 0, batch_size) + if previous == '\r' || previous == '\n' { + // End of sequence + s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) + s.SetSource(source) + sequences = append(sequences, s) + if no_order { + if len(sequences) == batch_size { + out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) + sequences = make(obiseq.BioSequenceSlice, 0, batch_size) + } } + state = 1 + } else { + // Error + context, _ := scanner.Peek(30) + context = append( + append([]byte{previous}, C), + context...) + log.Fatalf("%s [%s]: sequence cannot contain '>' in the middle (%s)", + source, identifier, string(context)) } - state = 1 } else if !is_sep { if C >= 'A' && C <= 'Z' { @@ -148,9 +182,19 @@ func _ParseFastaFile(source string, // Removing white space from the sequence if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' { seqBytes.WriteByte(C) + } else { + context, _ := scanner.Peek(30) + context = append( + append([]byte{previous}, C), + context...) + log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)", + source, identifier, C, string(context)) } } + } + + previous = C } if state == 6 {