pkg/obiformats/embl_read.go

package obiformats

import (
	"bufio"
	"bytes"
	"io"
	"path"
	"strconv"
	"strings"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)

// EndOfLastFlatFileEntry locates the end of the last complete flat-file entry
// held in buff. An entry ends with a line containing only "//", that is with
// the byte pattern
//
//	<CR>?<LF>//<CR>?<LF>
//
// where <CR> and <LF> are the ASCII carriage return and line feed. The buffer
// is scanned backwards from its last byte, so the terminator reported is the
// one closest to the end of buff.
//
// Arguments:
// buff []byte - the bytes to search for an entry terminator.
//
// Returns:
// int - the index of the byte following the terminator's final <LF>, or -1
// when buff contains no terminator.
func EndOfLastFlatFileEntry(buff []byte) int {
	// The pattern is matched right to left; each state names the piece that
	// has just been recognised:
	//
	//	      5  43 2    1
	//	<CR>?<LF>//<CR>?<LF>
	state := 0
	start := 0

	for i := len(buff) - 1; i >= 0 && state < 5; i-- {
		c := buff[i]

		switch state {
		case 0: // outside of the pattern
			if c == '\n' {
				state = 1
			}
		case 1: // the closing \n has been matched at i+1
			// Candidate result: the byte right after that \n.
			start = i + 2
			switch c {
			case '\r':
				state = 2
			case '/':
				state = 3
			case '\n':
				state = 1
			default:
				state = 0
			}
		case 2: // the optional \r has been matched
			switch c {
			case '/':
				state = 3
			case '\n':
				state = 1
			default:
				state = 0
			}
		case 3: // the last / has been matched
			switch c {
			case '/':
				state = 4
			case '\n':
				state = 1
			default:
				state = 0
			}
		case 4: // both / have been matched
			if c == '\n' {
				state = 5
			} else {
				state = 0
			}
		}
	}

	// Decide on the automaton state rather than on the loop index: a
	// terminator whose leading \n sits at offset 0 or 1 is still a match.
	if state == 5 {
		return start
	}

	return -1
}

// EmblChunkParser builds a parser that reads EMBL flat-file records line by
// line from an io.Reader.
//
// Arguments:
// withFeatureTable bool - when true, the FH and FT lines of each record are
// collected and attached to the sequence with SetFeatures.
// UtoT bool - when true, every lower-case "u" of the sequence data is
// replaced by "t".
//
// Returns:
// a function taking the source name and the input reader, and returning the
// parsed sequences. One sequence is produced per "//" line. The function
// returns a nil slice and the scanner error when reading the input fails
// (for instance bufio.ErrTooLong on an over-long line).
func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {
	parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {
		scanner := bufio.NewScanner(input)
		sequences := make(obiseq.BioSequenceSlice, 0, 100)

		// Per-record state; taxid 1 is the value used when the record has no
		// taxon cross-reference.
		id := ""
		scientificName := ""
		taxid := 1
		defBytes := new(bytes.Buffer)
		featBytes := new(bytes.Buffer)
		seqBytes := new(bytes.Buffer)

		for scanner.Scan() {
			line := scanner.Text()

			switch {
			case strings.HasPrefix(line, "ID   "):
				// The identifier is the text before the first ';'.
				id = strings.SplitN(line[5:], ";", 2)[0]
			case strings.HasPrefix(line, "OS   "):
				scientificName = strings.TrimSpace(line[5:])
			case strings.HasPrefix(line, "DE   "):
				// Multi-line descriptions are joined with single spaces.
				if defBytes.Len() > 0 {
					defBytes.WriteByte(' ')
				}
				defBytes.WriteString(strings.TrimSpace(line[5:]))
			case withFeatureTable && strings.HasPrefix(line, "FH   "):
				featBytes.WriteString(line)
			case withFeatureTable && line == "FH":
				featBytes.WriteByte('\n')
				featBytes.WriteString(line)
			case strings.HasPrefix(line, "FT   "):
				if withFeatureTable {
					featBytes.WriteByte('\n')
					featBytes.WriteString(line)
				}
				// The matched prefix is 37 bytes long, so line[37:] starts at
				// the taxid digits.
				if strings.HasPrefix(line, `FT                   /db_xref="taxon:`) {
					// A malformed number leaves the current taxid untouched
					// instead of overwriting it with 0.
					if value, err := strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0]); err == nil {
						taxid = value
					}
				}
			case strings.HasPrefix(line, "     "):
				// Sequence line: at most six space-separated groups of bases
				// followed by a last field (padding and position counter),
				// which is dropped.
				parts := strings.SplitN(line[5:], " ", 7)
				np := len(parts) - 1
				for i := 0; i < np; i++ {
					if UtoT {
						parts[i] = strings.ReplaceAll(parts[i], "u", "t")
					}
					seqBytes.WriteString(parts[i])
				}
			case line == "//":
				sequence := obiseq.NewBioSequence(id,
					seqBytes.Bytes(),
					defBytes.String())
				sequence.SetSource(source)

				if withFeatureTable {
					sequence.SetFeatures(featBytes.Bytes())
				}

				annot := sequence.Annotations()
				annot["scientific_name"] = scientificName
				annot["taxid"] = taxid
				sequences = append(sequences, sequence)

				// Reset every per-record field so that nothing leaks into
				// the next record when one of its lines is missing.
				id = ""
				scientificName = ""
				taxid = 1
				// String() returned a copy, so this buffer can be recycled.
				defBytes.Reset()
				// The byte slices of these two were handed to the sequence,
				// which may keep them: start from fresh buffers.
				featBytes = new(bytes.Buffer)
				seqBytes = new(bytes.Buffer)
			}
		}

		// Scan() returns false both at end of input and on failure; only
		// Err() tells them apart.
		if err := scanner.Err(); err != nil {
			return nil, err
		}

		return sequences, nil
	}

	return parser
}

// extractEmblSeq reads the remaining lines of an EMBL sequence section from
// the rope scanner and appends their bases to dest.
//
// EMBL sequence lines start with 5 spaces followed by bases in groups of 10,
// separated by spaces, with a position number at the end. Rather than parsing
// that layout, every ASCII letter of a line is kept (folded to lower case) and
// every other byte is skipped. When UtoT is true, 'u' (and therefore 'U',
// since case folding happens first) is stored as 't'.
//
// Reading stops when ReadLine returns nil or when a line starting with "//"
// has been read; that line is consumed. The extended slice is returned.
func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
	terminator := []byte("//")

	for line := s.ReadLine(); line != nil; line = s.ReadLine() {
		if bytes.HasPrefix(line, terminator) {
			break
		}

		for _, c := range line {
			switch {
			case 'A' <= c && c <= 'Z':
				// Fold upper case to lower case.
				c += 'a' - 'A'
			case c < 'a' || c > 'z':
				// Spaces, digits and anything else that is not a letter.
				continue
			}

			if UtoT && c == 'u' {
				c = 't'
			}
			dest = append(dest, c)
		}
	}

	return dest
}

// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
//
// Arguments:
// source string - the source name set on every sequence.
// rope *PieceOfChunk - the rope holding the chunk to parse.
// withFeatureTable bool - when true, the FH and FT lines of each record are
// collected and attached to the sequence with SetFeatures.
// UtoT bool - when true, 'u'/'U' bases are stored as 't'.
//
// Returns:
// the parsed sequences. A sequence is produced when the sequence section of a
// record has been read, or on a "//" line for a record that has an ID but no
// sequence section. The returned error is always nil.
func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
	scanner := newRopeScanner(rope)
	sequences := obiseq.MakeBioSequenceSlice(100)[:0]

	// Per-record state. taxid starts at 1, the same value it is reset to
	// after each record, so the first record of a chunk is treated like the
	// following ones when it has no taxon cross-reference.
	var id, scientificName string
	defBytes := make([]byte, 0, 256)
	featBytes := make([]byte, 0, 1024)
	taxid := 1

	// reset clears the per-record state while keeping the scratch buffers.
	reset := func() {
		id = ""
		scientificName = ""
		defBytes = defBytes[:0]
		featBytes = featBytes[:0]
		taxid = 1
	}

	// emit builds a sequence from the current record state and stores it.
	emit := func(seqData []byte) {
		seq := obiseq.NewBioSequenceOwning(id, seqData, string(defBytes))
		seq.SetSource(source)
		if withFeatureTable {
			// featBytes is a scratch buffer reused for the next record.
			// SetFeatures may keep the slice it receives (not visible from
			// here), so the sequence is given its own copy.
			features := make([]byte, len(featBytes))
			copy(features, featBytes)
			seq.SetFeatures(features)
		}
		annot := seq.Annotations()
		annot["scientific_name"] = scientificName
		annot["taxid"] = taxid
		sequences = append(sequences, seq)
	}

	for line := scanner.ReadLine(); line != nil; line = scanner.ReadLine() {
		switch {
		case bytes.HasPrefix(line, []byte("ID   ")):
			// The identifier is the text before the first ';'.
			id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
		case bytes.HasPrefix(line, []byte("OS   ")):
			scientificName = string(bytes.TrimSpace(line[5:]))
		case bytes.HasPrefix(line, []byte("DE   ")):
			// Multi-line descriptions are joined with single spaces.
			if len(defBytes) > 0 {
				defBytes = append(defBytes, ' ')
			}
			defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
		case withFeatureTable && bytes.HasPrefix(line, []byte("FH   ")):
			featBytes = append(featBytes, line...)
		case withFeatureTable && bytes.Equal(line, []byte("FH")):
			featBytes = append(featBytes, '\n')
			featBytes = append(featBytes, line...)
		case bytes.HasPrefix(line, []byte("FT   ")):
			if withFeatureTable {
				featBytes = append(featBytes, '\n')
				featBytes = append(featBytes, line...)
			}
			// The matched prefix is 37 bytes long, so line[37:] starts at
			// the taxid digits.
			if bytes.HasPrefix(line, []byte(`FT                   /db_xref="taxon:`)) {
				rest := line[37:]
				if end := bytes.IndexByte(rest, '"'); end > 0 {
					// A malformed number leaves the current taxid untouched
					// instead of overwriting it with 0.
					if value, err := strconv.Atoi(string(rest[:end])); err == nil {
						taxid = value
					}
				}
			}
		case bytes.HasPrefix(line, []byte("     ")):
			// First line of the sequence section. ReadLine has already
			// consumed it, so its bases are extracted here; extractEmblSeq
			// then reads the following lines up to and including "//".
			seqDest := make([]byte, 0, 4096)
			for _, b := range line {
				if b >= 'A' && b <= 'Z' {
					b += 'a' - 'A'
				}
				if UtoT && b == 'u' {
					b = 't'
				}
				if b >= 'a' && b <= 'z' {
					seqDest = append(seqDest, b)
				}
			}
			seqDest = scanner.extractEmblSeq(seqDest, UtoT)

			emit(seqDest)
			reset()
		case bytes.Equal(line, []byte("//")):
			// Record ended without SQ/sequence section (e.g. WGS entries).
			if id != "" {
				emit([]byte{})
			}
			reset()
		}
	}

	return sequences, nil
}

// _ParseEmblFile is the worker loop of the EMBL reader. It receives file
// chunks from input until the channel is closed, parses each of them and
// pushes the resulting sequences to out as one batch carrying the chunk's
// source and order. A chunk that comes with a rope is parsed by
// EmblChunkParserRope, any other chunk by the reader-based parser applied to
// its Raw field. A parsing error is reported through log.Fatalf. out.Done()
// is called once the input channel is exhausted.
func _ParseEmblFile(
	input ChannelFileChunk,
	out obiiter.IBioSequence,
	withFeatureTable, UtoT bool,
) {
	parseRaw := EmblChunkParser(withFeatureTable, UtoT)

	for chunk := range input {
		var (
			batch obiseq.BioSequenceSlice
			err   error
		)

		if chunk.Rope == nil {
			batch, err = parseRaw(chunk.Source, chunk.Raw)
		} else {
			batch, err = EmblChunkParserRope(chunk.Source, chunk.Rope, withFeatureTable, UtoT)
		}

		if err != nil {
			log.Fatalf("%s : Cannot parse the embl file : %v", chunk.Source, err)
		}

		out.Push(obiiter.MakeBioSequenceBatch(chunk.Source, chunk.Order, batch))
	}

	out.Done()
}

// ReadEMBL reads EMBL flat-file records from reader and returns them through
// a sequence iterator.
//
// The input is handed to ReadFileChunk together with EndOfLastFlatFileEntry,
// which finds the end of the last complete record of a buffer, i.e. the last
// line holding only "//":
//
//	<CR>?<LF>//<CR>?<LF>
//
// opt.ParallelWorkers() goroutines running _ParseEmblFile consume the chunk
// channel and push their batches to the iterator, which is closed by a
// further goroutine through WaitAndClose. When the full_file_batch option is
// set, the iterator returned is the result of CompleteFileIterator.
//
// The returned error is always nil.
func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
	opt := MakeOptions(options)

	// NOTE(review): 128 MiB is presumably the chunk size and "\nID   " the
	// marker of a record start; confirm against ReadFileChunk.
	entryChannel := ReadFileChunk(
		opt.Source(),
		reader,
		1024*1024*128,
		EndOfLastFlatFileEntry,
		"\nID   ",
		false,
	)

	parsed := obiiter.MakeIBioSequence()

	nworkers := opt.ParallelWorkers()

	for j := 0; j < nworkers; j++ {
		parsed.Add(1)
		go _ParseEmblFile(
			entryChannel,
			parsed,
			opt.WithFeatureTable(),
			opt.UtoT(),
		)
	}

	// parsed is never reassigned, so this goroutine always closes the
	// iterator the workers push to. Reusing one variable for both that
	// iterator and the CompleteFileIterator result would let the goroutine
	// race with the reassignment and close the wrong iterator.
	go func() {
		parsed.WaitAndClose()
	}()

	if opt.pointer.full_file_batch {
		return parsed.CompleteFileIterator(), nil
	}

	return parsed, nil
}

// ReadEMBLFromFile opens filename with obiutils.Ropen and reads its content
// as EMBL records with ReadEMBL.
//
// An OptionsSource option built from the base name of filename, passed
// through obiutils.RemoveAllExt, is appended to options before the file is
// opened. When Ropen reports obiutils.ErrNoContent, the fact is logged and
// the result of ReadEmptyFile is returned. Any other opening error is logged
// and returned together with obiiter.NilIBioSequence.
func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
	source := obiutils.RemoveAllExt(path.Base(filename))
	options = append(options, OptionsSource(source))

	var (
		input io.Reader
		err   error
	)
	input, err = obiutils.Ropen(filename)

	switch {
	case err == obiutils.ErrNoContent:
		log.Infof("file %s is empty", filename)
		return ReadEmptyFile(options...)
	case err != nil:
		log.Printf("open file error: %+v", err)
		return obiiter.NilIBioSequence, err
	}

	return ReadEMBL(input, options...)
}
First commit 2022-01-13 23:27:39 +01:00			`package obiformats`

			`import (`
			`"bufio"`
			`"bytes"`
			`"io"`
First attempt for obiconsensus... The graph traversing algorithm is too simple Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba 2023-03-27 19:51:10 +07:00			`"path"`
First commit 2022-01-13 23:27:39 +01:00			`"strconv"`
			`"strings"`

Improved performance and ability to read very long sequences. 2022-08-21 13:38:13 +02:00			`log "github.com/sirupsen/logrus"`

Change path of the obitools pkg Former-commit-id: 311cbf8df3b990b393c6f4885d62e74564423b65 2023-11-29 12:14:37 +01:00			`"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"`
			`"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"`
			`"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"`
First commit 2022-01-13 23:27:39 +01:00			`)`

Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`// EndOfLastFlatFileEntry finds the index of the last entry in the given byte slice 'buff'`
Patch an embl/genbank parser error 2023-02-16 13:30:42 +01:00			`// using a pattern match of the form:`
			`// <CR>?<LF>//<CR>?<LF>`
			`// where <CR> and <LF> are the ASCII codes for carriage return and line feed,`
			`// respectively. The function returns the index of the end of the last entry`
			`// or -1 if no match is found.`
			`//`
			`// Arguments:`
			`// buff []byte - a byte slice to search for the end of the last entry`
			`//`
			`// Returns:`
			`// int - the index of the end of the last entry or -1 if no match is found.`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`func EndOfLastFlatFileEntry(buff []byte) int {`
First commit 2022-01-13 23:27:39 +01:00			`// 6 5 43 2 1`
			`// <CR>?<LF>//<CR>?<LF>`
			`var i int`
			`var state = 0`
			`var start = 0`
			`for i = len(buff) - 1; i >= 0 && state < 5; i-- {`
			`switch state {`
			`case 0: // outside of the pattern`
			`if buff[i] == '\n' {`
			`state = 1`
			`}`
			`case 1: // a \n have been matched`
			`start = i + 2`
			`switch buff[i] {`
			`case '\r':`
			`state = 2`
			`case '/':`
			`state = 3`
			`case '\n':`
			`state = 1`
			`default:`
			`state = 0`
			`}`
			`case 2: // a \r have been matched`
			`switch buff[i] {`
			`case '/':`
			`state = 3`
			`case '\n':`
			`state = 1`
			`default:`
			`state = 0`
			`}`
			`case 3: // the first / have been matched`
			`switch buff[i] {`
			`case '/':`
			`state = 4`
			`case '\n':`
			`state = 1`
			`default:`
			`state = 0`
			`}`
			`case 4: // the second / have been matched`
			`switch buff[i] {`
			`case '\n':`
			`state = 5`
			`default:`
			`state = 0`
			`}`
			`}`

			`}`

			`if i > 0 {`
			`return start`
			`}`
Code refactoring 2022-01-14 17:32:12 +01:00
			`return -1`
First commit 2022-01-13 23:27:39 +01:00			`}`

Add a reading option on readers to convet U to T 2025-07-07 15:29:07 +02:00			`func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlice, error) {`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`parser := func(source string, input io.Reader) (obiseq.BioSequenceSlice, error) {`
			`scanner := bufio.NewScanner(input)`
First commit 2022-01-13 23:27:39 +01:00			`sequences := make(obiseq.BioSequenceSlice, 0, 100)`
			`id := ""`
Code refactoring 2022-01-14 17:32:12 +01:00			`scientificName := ""`
			`defBytes := new(bytes.Buffer)`
			`featBytes := new(bytes.Buffer)`
			`seqBytes := new(bytes.Buffer)`
First commit 2022-01-13 23:27:39 +01:00			`taxid := 1`
			`for scanner.Scan() {`

			`line := scanner.Text()`

			`switch {`
			`case strings.HasPrefix(line, "ID "):`
			`id = strings.SplitN(line[5:], ";", 2)[0]`
			`case strings.HasPrefix(line, "OS "):`
Code refactoring 2022-01-14 17:32:12 +01:00			`scientificName = strings.TrimSpace(line[5:])`
First commit 2022-01-13 23:27:39 +01:00			`case strings.HasPrefix(line, "DE "):`
Code refactoring 2022-01-14 17:32:12 +01:00			`if defBytes.Len() > 0 {`
			`defBytes.WriteByte(' ')`
First commit 2022-01-13 23:27:39 +01:00			`}`
Code refactoring 2022-01-14 17:32:12 +01:00			`defBytes.WriteString(strings.TrimSpace(line[5:]))`
Work on EMBL and Genbank parser efficienct Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07 2024-02-20 13:23:07 +01:00			`case withFeatureTable && strings.HasPrefix(line, "FH "):`
Code refactoring 2022-01-14 17:32:12 +01:00			`featBytes.WriteString(line)`
Work on EMBL and Genbank parser efficienct Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07 2024-02-20 13:23:07 +01:00			`case withFeatureTable && line == "FH":`
Code refactoring 2022-01-14 17:32:12 +01:00			`featBytes.WriteByte('\n')`
			`featBytes.WriteString(line)`
First commit 2022-01-13 23:27:39 +01:00			`case strings.HasPrefix(line, "FT "):`
Work on EMBL and Genbank parser efficienct Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07 2024-02-20 13:23:07 +01:00			`if withFeatureTable {`
			`featBytes.WriteByte('\n')`
			`featBytes.WriteString(line)`
			`}`
First commit 2022-01-13 23:27:39 +01:00			if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
			taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
			`}`
			`case strings.HasPrefix(line, " "):`
			`parts := strings.SplitN(line[5:], " ", 7)`
Correct the bug in embl reader Former-commit-id: 579d397ca16e8c4cf2f8ba01e503e62b2fffa06f 2024-01-31 15:50:14 +01:00			`np := len(parts) - 1`
Patch a bug in the embl reader and adds some doc Former-commit-id: 9b5f75fb14bcc3043da1647055279987a295d271 2024-01-31 15:43:02 +01:00			`for i := 0; i < np; i++ {`
Add a reading option on readers to convet U to T 2025-07-07 15:29:07 +02:00			`if UtoT {`
			`parts[i] = strings.ReplaceAll(parts[i], "u", "t")`
			`}`
Code refactoring 2022-01-14 17:32:12 +01:00			`seqBytes.WriteString(parts[i])`
First commit 2022-01-13 23:27:39 +01:00			`}`
			`case line == "//":`
Big change iin the data model, and a first version of obiuniq 2022-02-21 19:00:23 +01:00			`sequence := obiseq.NewBioSequence(id,`
Reduce redundante call to bytes.ToLower and substitute the last call by an home made version doing the conversion in place Former-commit-id: d9ea22f649d97be352f8dbb37acc1495df830118 2023-03-28 11:43:04 +07:00			`seqBytes.Bytes(),`
Code refactoring 2022-01-14 17:32:12 +01:00			`defBytes.String())`
First attempt for obiconsensus... The graph traversing algorithm is too simple Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba 2023-03-27 19:51:10 +07:00			`sequence.SetSource(source)`
Reduce redundante call to bytes.ToLower and substitute the last call by an home made version doing the conversion in place Former-commit-id: d9ea22f649d97be352f8dbb37acc1495df830118 2023-03-28 11:43:04 +07:00
Work on EMBL and Genbank parser efficienct Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07 2024-02-20 13:23:07 +01:00			`if withFeatureTable {`
			`sequence.SetFeatures(featBytes.Bytes())`
			`}`
First commit 2022-01-13 23:27:39 +01:00
			`annot := sequence.Annotations()`
Code refactoring 2022-01-14 17:32:12 +01:00			`annot["scientific_name"] = scientificName`
First commit 2022-01-13 23:27:39 +01:00			`annot["taxid"] = taxid`
			`// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))`
			`sequences = append(sequences, sequence)`
Code refactoring 2022-01-14 17:32:12 +01:00			`defBytes = new(bytes.Buffer)`
			`featBytes = new(bytes.Buffer)`
			`seqBytes = new(bytes.Buffer)`
First commit 2022-01-13 23:27:39 +01:00			`}`
			`}`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00
			`return sequences, nil`

			`}`

			`return parser`
			`}`

Add EMBL rope parsing support and improve sequence extraction Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries. 2026-03-10 17:02:05 +01:00			`// extractEmblSeq scans the sequence section of an EMBL record directly on the`
			`// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of`
			`// 10, separated by spaces, with a position number at the end. The section ends`
			`// with "//".`
			`func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {`
			`// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).`
			`for {`
			`line := s.ReadLine()`
			`if line == nil {`
			`break`
			`}`
			`if len(line) >= 2 && line[0] == '/' && line[1] == '/' {`
			`break`
			`}`
			`// Lines start with 5 spaces; bases follow separated by single spaces.`
			`// Digits at the end are the position counter — skip them.`
			`// Simplest: take every byte that is a letter.`
			`for _, b := range line {`
			`if b >= 'A' && b <= 'Z' {`
			`b += 'a' - 'A'`
			`}`
			`if UtoT && b == 'u' {`
			`b = 't'`
			`}`
			`if b >= 'a' && b <= 'z' {`
			`dest = append(dest, b)`
			`}`
			`}`
			`}`
			`return dest`
			`}`

			`// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().`
			`func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {`
			`scanner := newRopeScanner(rope)`
			`sequences := obiseq.MakeBioSequenceSlice(100)[:0]`

			`var id string`
			`var scientificName string`
			`defBytes := make([]byte, 0, 256)`
			`featBytes := make([]byte, 0, 1024)`
			`var taxid int`
			`inSeq := false`

			`for {`
			`line := scanner.ReadLine()`
			`if line == nil {`
			`break`
			`}`

			`if inSeq {`
			`// Should not happen — extractEmblSeq consumed up to "//"`
			`inSeq = false`
			`continue`
			`}`

			`switch {`
			`case bytes.HasPrefix(line, []byte("ID ")):`
			`id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])`
			`case bytes.HasPrefix(line, []byte("OS ")):`
			`scientificName = string(bytes.TrimSpace(line[5:]))`
			`case bytes.HasPrefix(line, []byte("DE ")):`
			`if len(defBytes) > 0 {`
			`defBytes = append(defBytes, ' ')`
			`}`
			`defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)`
			`case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")):`
			`featBytes = append(featBytes, line...)`
			`case withFeatureTable && bytes.Equal(line, []byte("FH")):`
			`featBytes = append(featBytes, '\n')`
			`featBytes = append(featBytes, line...)`
			`case bytes.HasPrefix(line, []byte("FT ")):`
			`if withFeatureTable {`
			`featBytes = append(featBytes, '\n')`
			`featBytes = append(featBytes, line...)`
			`}`
			if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) {
			`rest := line[37:]`
			`end := bytes.IndexByte(rest, '"')`
			`if end > 0 {`
			`taxid, _ = strconv.Atoi(string(rest[:end]))`
			`}`
			`}`
			`case bytes.HasPrefix(line, []byte(" ")):`
			`// First sequence line: extract all bases via extractEmblSeq,`
			`// which also consumes this line's remaining content.`
			`// But ReadLine already consumed this line — we need to process it`
			`// plus subsequent lines. Process this line inline then call helper.`
			`seqDest := make([]byte, 0, 4096)`
			`for _, b := range line {`
			`if b >= 'A' && b <= 'Z' {`
			`b += 'a' - 'A'`
			`}`
			`if UtoT && b == 'u' {`
			`b = 't'`
			`}`
			`if b >= 'a' && b <= 'z' {`
			`seqDest = append(seqDest, b)`
			`}`
			`}`
			`seqDest = scanner.extractEmblSeq(seqDest, UtoT)`

			`seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))`
			`seq.SetSource(source)`
			`if withFeatureTable {`
			`seq.SetFeatures(featBytes)`
			`}`
			`annot := seq.Annotations()`
			`annot["scientific_name"] = scientificName`
			`annot["taxid"] = taxid`
			`sequences = append(sequences, seq)`

			`// Reset state`
			`id = ""`
			`scientificName = ""`
			`defBytes = defBytes[:0]`
			`featBytes = featBytes[:0]`
			`taxid = 1`

			`case bytes.Equal(line, []byte("//")):`
			`// record ended without SQ/sequence section (e.g. WGS entries)`
			`if id != "" {`
			`seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))`
			`seq.SetSource(source)`
			`if withFeatureTable {`
			`seq.SetFeatures(featBytes)`
			`}`
			`annot := seq.Annotations()`
			`annot["scientific_name"] = scientificName`
			`annot["taxid"] = taxid`
			`sequences = append(sequences, seq)`
			`}`
			`id = ""`
			`scientificName = ""`
			`defBytes = defBytes[:0]`
			`featBytes = featBytes[:0]`
			`taxid = 1`
			`}`
			`}`

			`return sequences, nil`
			`}`

Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`func _ParseEmblFile(`
refactoring of the file chunck writing 2024-11-29 18:15:03 +01:00			`input ChannelFileChunk,`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`out obiiter.IBioSequence,`
Add a reading option on readers to convet U to T 2025-07-07 15:29:07 +02:00			`withFeatureTable, UtoT bool,`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`) {`

Add a reading option on readers to convet U to T 2025-07-07 15:29:07 +02:00			`parser := EmblChunkParser(withFeatureTable, UtoT)`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00
			`for chunks := range input {`
			`order := chunks.Order`
Add EMBL rope parsing support and improve sequence extraction Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries. 2026-03-10 17:02:05 +01:00			`var sequences obiseq.BioSequenceSlice`
			`var err error`

			`if chunks.Rope != nil {`
			`sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)`
			`} else {`
			`sequences, err = parser(chunks.Source, chunks.Raw)`
			`}`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00
			`if err != nil {`
			`log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)`
			`}`

			`out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, order, sequences))`
First commit 2022-01-13 23:27:39 +01:00			`}`

			`out.Done()`

			`}`

Improved performance and ability to read very long sequences. 2022-08-21 13:38:13 +02:00			`// 6 5 43 2 1`
			`//`
First commit 2022-01-13 23:27:39 +01:00			`// <CR>?<LF>//<CR>?<LF>`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {`
First commit 2022-01-13 23:27:39 +01:00			`opt := MakeOptions(options)`

refactoring of the file chunck writing 2024-11-29 18:15:03 +01:00			`entry_channel := ReadFileChunk(`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`opt.Source(),`
			`reader,`
Accelerate the speed of very long fasta sequences, and more generaly of every format 2025-03-12 13:29:41 +01:00			`10241024128,`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`EndOfLastFlatFileEntry,`
Accelerate the speed of very long fasta sequences, and more generaly of every format 2025-03-12 13:29:41 +01:00			`"\nID ",`
Add EMBL rope parsing support and improve sequence extraction Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries. 2026-03-10 17:02:05 +01:00			`false,`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`)`

Refactoring codes for removing buffer size options. An some other changes... Former-commit-id: 10b57cc1a27446ade3c444217341e9651e89cdce 2023-03-07 11:12:13 +07:00			`newIter := obiiter.MakeIBioSequence()`
First commit 2022-01-13 23:27:39 +01:00
Adds a reader for NGS filter files and change some API for the apat library 2022-01-18 13:09:32 +01:00			`nworkers := opt.ParallelWorkers()`
First commit 2022-01-13 23:27:39 +01:00
			`// for j := 0; j < opt.ParallelWorkers(); j++ {`
Adds a reader for NGS filter files and change some API for the apat library 2022-01-18 13:09:32 +01:00			`for j := 0; j < nworkers; j++ {`
Refactor sequence file reading Former-commit-id: 3dcb96e68da648d72bb585da047e3496427d7851 2024-05-01 00:50:23 +02:00			`newIter.Add(1)`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`go _ParseEmblFile(`
			`entry_channel,`
			`newIter,`
Work on EMBL and Genbank parser efficienct Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07 2024-02-20 13:23:07 +01:00			`opt.WithFeatureTable(),`
Add a reading option on readers to convet U to T 2025-07-07 15:29:07 +02:00			`opt.UtoT(),`
Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`)`
First commit 2022-01-13 23:27:39 +01:00			`}`

Refactor sequence file reading Former-commit-id: 3dcb96e68da648d72bb585da047e3496427d7851 2024-05-01 00:50:23 +02:00			`go func() {`
			`newIter.WaitAndClose()`
			`}()`
First commit 2022-01-13 23:27:39 +01:00
First attempt for obiconsensus... The graph traversing algorithm is too simple Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba 2023-03-27 19:51:10 +07:00			`if opt.pointer.full_file_batch {`
A go implementation of the fasta reader Former-commit-id: 603592c4761fb0722e9e0501d78de1bd3ba238fa 2023-09-01 09:30:12 +02:00			`newIter = newIter.CompleteFileIterator()`
First attempt for obiconsensus... The graph traversing algorithm is too simple Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba 2023-03-27 19:51:10 +07:00			`}`

Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`return newIter, nil`
First commit 2022-01-13 23:27:39 +01:00			`}`

rename the iterator class 2023-01-22 22:04:17 +01:00			`func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {`
First commit 2022-01-13 23:27:39 +01:00			`var reader io.Reader`
			`var err error`

First attempt for obiconsensus... The graph traversing algorithm is too simple Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba 2023-03-27 19:51:10 +07:00			`options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))`

Adds the ability to read gzip-tar file for the taxonomy dump 2025-01-24 11:47:59 +01:00			`reader, err = obiutils.Ropen(filename)`
Correctly handle empty files Former-commit-id: d166aa352ce4bf32739ddc2f7d1c9967918822fd 2023-10-16 15:34:06 +02:00
Adds the ability to read gzip-tar file for the taxonomy dump 2025-01-24 11:47:59 +01:00			`if err == obiutils.ErrNoContent {`
Correctly handle empty files Former-commit-id: d166aa352ce4bf32739ddc2f7d1c9967918822fd 2023-10-16 15:34:06 +02:00			`log.Infof("file %s is empty", filename)`
			`return ReadEmptyFile(options...)`
			`}`

First commit 2022-01-13 23:27:39 +01:00			`if err != nil {`
			`log.Printf("open file error: %+v", err)`
rename the iterator class 2023-01-22 22:04:17 +01:00			`return obiiter.NilIBioSequence, err`
First commit 2022-01-13 23:27:39 +01:00			`}`

Add some code refactoring from the blackboard branch 2024-08-02 12:35:46 +02:00			`return ReadEMBL(reader, options...)`
First commit 2022-01-13 23:27:39 +01:00			`}`