Patch a bug in the fasta and fastq readers

Former-commit-id: 4998f157a90a6b077124d87d4a5cde0dd075d1ce
2025-06-29 16:20:46 +00:00 · 2023-10-13 14:21:27 +02:00
parent 6acce603a1
commit 157c26cdc7
2 changed files with 28 additions and 8 deletions
--- a/pkg/obiformats/fastaseq_read.go
+++ b/pkg/obiformats/fastaseq_read.go
@@ -176,6 +176,7 @@ func ParseFastaChunk(source string, ch FastxChunk) *obiiter.BioSequenceBatch {
 		case 1:
 			if is_sep {
 				// No identifier -> ERROR
 				log.Errorf("%s : sequence entry does not have an identifier", source)
 				return nil
 			} else {
 				// Beginning of identifier
@@ -188,6 +189,11 @@ func ParseFastaChunk(source string, ch FastxChunk) *obiiter.BioSequenceBatch {
 				identifier = string(ch.Bytes[start:i])
 				state = 3
 			}
+			if is_end_of_line {
+				// Definition empty
+				definition = ""
+				state = 5
+			}
 		case 3:
 			if is_end_of_line {
 				// Definition empty
--- a/pkg/obiformats/fastqseq_read.go
+++ b/pkg/obiformats/fastqseq_read.go
@@ -169,13 +169,19 @@ func ParseFastqChunk(source string, ch FastxChunk, quality_shift byte) *obiiter.
 		is_space := C == ' ' || C == '\t'
 		is_sep := is_space || is_end_of_line
 		// log.Infof("%s : state = %d pos = %d character = %c (%d)", source, state, i, C, C)
 		switch state {
-		case 0:
+		case 0: // Beginning of sequence chunk must start with @
 			if C == '@' {
 				// Beginning of sequence
 				state = 1
 			} else {
 				log.Errorf("%s : sequence entry is not starting with @", source)
 				return nil
 			}
-		case 1:
+		case 1: // Beginning of identifier (Mandatory)
 			if is_sep {
 				// No identifier -> ERROR
 				log.Errorf("%s : sequence identifier is empty", source)
@@ -185,13 +191,18 @@ func ParseFastqChunk(source string, ch FastxChunk, quality_shift byte) *obiiter.
 				state = 2
 				start = i
 			}
-		case 2:
+		case 2: // Following of the identifier
 			if is_sep {
 				// End of identifier
 				identifier = string(ch.Bytes[start:i])
 				state = 3
 			}
-		case 3:
+			if is_end_of_line {
+				// Definition empty
+				definition = ""
+				state = 5
+			}
+		case 3: // Beginning of definition
 			if is_end_of_line {
 				// Definition empty
 				definition = ""
@@ -201,13 +212,12 @@ func ParseFastqChunk(source string, ch FastxChunk, quality_shift byte) *obiiter.
 				start = i
 				state = 4
 			}
-		case 4:
+		case 4: // Following of the definition
 			if is_end_of_line {
 				definition = string(ch.Bytes[start:i])
 				state = 5
 			}
-		case 5:
+		case 5: // Beginning of sequence
 			if !is_end_of_line {
 				// Beginning of sequence
 				start = i
@@ -236,7 +246,11 @@ func ParseFastqChunk(source string, ch FastxChunk, quality_shift byte) *obiiter.
 			} else if C == '+' {
 				state = 8
 			} else {
-				log.Errorf("%s[%s] : sequence data not followed by a line starting with +", identifier, source)
+				log.Info(ch.Bytes[0:i])
+				log.Info(string(ch.Bytes[0:i]))
+				log.Info(C)
+				log.Errorf("@%s[%s] : sequence data not followed by a line starting with +", identifier, source)
 				return nil // Error
 			}
 		case 8: