mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds more format checking during FASTA parsing
Former-commit-id: fbc3d9c923936287a591f01f9401b710b584aa14
This commit is contained in:
@ -59,7 +59,17 @@ func _ParseFastaFile(source string,
|
|||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
scanner := bufio.NewReader(chunks.raw)
|
scanner := bufio.NewReader(chunks.raw)
|
||||||
|
start, _ := scanner.Peek(20)
|
||||||
|
if start[0] != '>' {
|
||||||
|
log.Fatalf("%s : first character is not '>'", string(start))
|
||||||
|
}
|
||||||
|
if start[1] == ' ' {
|
||||||
|
log.Fatalf("%s :Strange", string(start))
|
||||||
|
}
|
||||||
sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
|
sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
|
||||||
|
|
||||||
|
previous := byte(0)
|
||||||
|
|
||||||
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
|
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
|
||||||
|
|
||||||
is_end_of_line := C == '\r' || C == '\n'
|
is_end_of_line := C == '\r' || C == '\n'
|
||||||
@ -71,11 +81,18 @@ func _ParseFastaFile(source string,
|
|||||||
if C == '>' {
|
if C == '>' {
|
||||||
// Beginning of sequence
|
// Beginning of sequence
|
||||||
state = 1
|
state = 1
|
||||||
|
} else {
|
||||||
|
// ERROR
|
||||||
|
log.Fatalf("%s : sequence entry does not start with '>'", source)
|
||||||
}
|
}
|
||||||
case 1:
|
case 1:
|
||||||
if is_sep {
|
if is_sep {
|
||||||
// No identifier -> ERROR
|
// No identifier -> ERROR
|
||||||
log.Errorf("%s : sequence entry does not have an identifier", source)
|
|
||||||
|
context, _ := scanner.Peek(30)
|
||||||
|
context = append([]byte{C}, context...)
|
||||||
|
log.Fatalf("%s [%s]: sequence entry does not have an identifier",
|
||||||
|
source, string(context))
|
||||||
} else {
|
} else {
|
||||||
// Beginning of identifier
|
// Beginning of identifier
|
||||||
idBytes.Reset()
|
idBytes.Reset()
|
||||||
@ -124,22 +141,39 @@ func _ParseFastaFile(source string,
|
|||||||
|
|
||||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||||
seqBytes.WriteByte(C)
|
seqBytes.WriteByte(C)
|
||||||
|
} else {
|
||||||
|
context, _ := scanner.Peek(30)
|
||||||
|
context = append(
|
||||||
|
append([]byte{previous}, C),
|
||||||
|
context...)
|
||||||
|
log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)",
|
||||||
|
source, identifier, C, string(context))
|
||||||
}
|
}
|
||||||
state = 6
|
state = 6
|
||||||
}
|
}
|
||||||
case 6:
|
case 6:
|
||||||
if C == '>' {
|
if C == '>' {
|
||||||
// End of sequence
|
if previous == '\r' || previous == '\n' {
|
||||||
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
// End of sequence
|
||||||
s.SetSource(source)
|
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
||||||
sequences = append(sequences, s)
|
s.SetSource(source)
|
||||||
if no_order {
|
sequences = append(sequences, s)
|
||||||
if len(sequences) == batch_size {
|
if no_order {
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
if len(sequences) == batch_size {
|
||||||
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
|
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||||
|
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
state = 1
|
||||||
|
} else {
|
||||||
|
// Error
|
||||||
|
context, _ := scanner.Peek(30)
|
||||||
|
context = append(
|
||||||
|
append([]byte{previous}, C),
|
||||||
|
context...)
|
||||||
|
log.Fatalf("%s [%s]: sequence cannot contain '>' in the middle (%s)",
|
||||||
|
source, identifier, string(context))
|
||||||
}
|
}
|
||||||
state = 1
|
|
||||||
|
|
||||||
} else if !is_sep {
|
} else if !is_sep {
|
||||||
if C >= 'A' && C <= 'Z' {
|
if C >= 'A' && C <= 'Z' {
|
||||||
@ -148,9 +182,19 @@ func _ParseFastaFile(source string,
|
|||||||
// Removing white space from the sequence
|
// Removing white space from the sequence
|
||||||
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
if (C >= 'a' && C <= 'z') || C == '-' || C == '.' || C == '[' || C == ']' {
|
||||||
seqBytes.WriteByte(C)
|
seqBytes.WriteByte(C)
|
||||||
|
} else {
|
||||||
|
context, _ := scanner.Peek(30)
|
||||||
|
context = append(
|
||||||
|
append([]byte{previous}, C),
|
||||||
|
context...)
|
||||||
|
log.Fatalf("%s [%s]: sequence contains invalid character %c (%s)",
|
||||||
|
source, identifier, C, string(context))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
previous = C
|
||||||
}
|
}
|
||||||
|
|
||||||
if state == 6 {
|
if state == 6 {
|
||||||
|
Reference in New Issue
Block a user