Adds more format checks during FASTA parsing

Former-commit-id: fbc3d9c923936287a591f01f9401b710b584aa14
This commit is contained in:
Eric Coissac
2024-05-30 18:12:06 +02:00
parent dd9307a4cd
commit 3e1d9a41ec

View File

@ -59,7 +59,17 @@ func _ParseFastaFile(source string,
for chunks := range input { for chunks := range input {
scanner := bufio.NewReader(chunks.raw) scanner := bufio.NewReader(chunks.raw)
start, _ := scanner.Peek(20)
if start[0] != '>' {
log.Fatalf("%s : first character is not '>'", string(start))
}
if start[1] == ' ' {
log.Fatalf("%s :Strange", string(start))
}
sequences := make(obiseq.BioSequenceSlice, 0, batch_size) sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
previous := byte(0)
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() { for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
is_end_of_line := C == '\r' || C == '\n' is_end_of_line := C == '\r' || C == '\n'
@ -71,11 +81,18 @@ func _ParseFastaFile(source string,
if C == '>' { if C == '>' {
// Beginning of sequence // Beginning of sequence
state = 1 state = 1
} else {
// ERROR
log.Fatalf("%s : sequence entry does not start with '>'", source)
} }
case 1: case 1:
if is_sep { if is_sep {
// No identifier -> ERROR // No identifier -> ERROR
log.Errorf("%s : sequence entry does not have an identifier", source)
context, _ := scanner.Peek(30)
context = append([]byte{C}, context...)
log.Fatalf("%s [%s]: sequence entry does not have an identifier",
source, string(context))
} else { } else {
// Beginning of identifier // Beginning of identifier
idBytes.Reset() idBytes.Reset()
@ -124,22 +141,39 @@ func _ParseFastaFile(source string,
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' { if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
seqBytes.WriteByte(C) seqBytes.WriteByte(C)
} else {
context, _ := scanner.Peek(30)
context = append(
append([]byte{previous}, C),
context...)
log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)",
source, identifier, C, string(context))
} }
state = 6 state = 6
} }
case 6: case 6:
if C == '>' { if C == '>' {
// End of sequence if previous == '\r' || previous == '\n' {
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition) // End of sequence
s.SetSource(source) s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
sequences = append(sequences, s) s.SetSource(source)
if no_order { sequences = append(sequences, s)
if len(sequences) == batch_size { if no_order {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) if len(sequences) == batch_size {
sequences = make(obiseq.BioSequenceSlice, 0, batch_size) out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
}
} }
state = 1
} else {
// Error
context, _ := scanner.Peek(30)
context = append(
append([]byte{previous}, C),
context...)
log.Fatalf("%s [%s]: sequence cannot contain '>' in the middle (%s)",
source, identifier, string(context))
} }
state = 1
} else if !is_sep { } else if !is_sep {
if C >= 'A' && C <= 'Z' { if C >= 'A' && C <= 'Z' {
@ -148,9 +182,19 @@ func _ParseFastaFile(source string,
// Removing white space from the sequence // Removing white space from the sequence
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' { if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
seqBytes.WriteByte(C) seqBytes.WriteByte(C)
} else {
context, _ := scanner.Peek(30)
context = append(
append([]byte{previous}, C),
context...)
log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)",
source, identifier, C, string(context))
} }
} }
} }
previous = C
} }
if state == 6 { if state == 6 {