From 623116ab13410a46d00ed8ce183702a6dad29ef0 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 16:34:25 +0100 Subject: [PATCH 1/8] Add rope-based FASTA parsing and improve sequence handling Introduce FastaChunkParserRope for direct rope-based FASTA parsing, enhance sequence extraction with whitespace skipping and U->T conversion, and update parser logic to support both rope and raw data sources. - Added extractFastaSeq function to scan sequence bytes directly from rope - Implemented FastaChunkParserRope for rope-based parsing - Modified _ParseFastaFile to use rope when available - Updated sequence handling to support U->T conversion - Fixed line ending detection for FASTA parsing --- pkg/obiformats/fastaseq_read.go | 105 ++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 6 deletions(-) diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 3597be2..c700ea2 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic return parser } +// extractFastaSeq scans sequence bytes from the rope directly into dest, +// appending valid nucleotide characters and skipping whitespace. +// Stops when '>' is found at the start of a line (next record) or at EOF. +// Returns (dest with appended bases, hasMore). +// hasMore=true means scanner is now positioned at '>' of the next record. 
+func (s *gbRopeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) { + lineStart := true + + for s.current != nil { + data := s.current.data[s.pos:] + for i, b := range data { + if lineStart && b == '>' { + s.pos += i + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + return dest, true + } + if b == '\n' || b == '\r' { + lineStart = true + continue + } + lineStart = false + if b == ' ' || b == '\t' { + continue + } + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if UtoT && b == 'u' { + b = 't' + } + dest = append(dest, b) + } + s.current = s.current.Next() + s.pos = 0 + } + return dest, false +} + +// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack(). +func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) { + scanner := newGbRopeScanner(rope) + sequences := obiseq.MakeBioSequenceSlice(100)[:0] + + for { + bline := scanner.ReadLine() + if bline == nil { + break + } + if len(bline) == 0 || bline[0] != '>' { + continue + } + + // Parse header: ">id definition" + header := bline[1:] + var id string + var definition string + sp := bytes.IndexByte(header, ' ') + if sp < 0 { + sp = bytes.IndexByte(header, '\t') + } + if sp < 0 { + id = string(header) + } else { + id = string(header[:sp]) + definition = string(bytes.TrimSpace(header[sp+1:])) + } + + seqDest := make([]byte, 0, 4096) + var hasMore bool + seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT) + + if len(seqDest) == 0 { + log.Fatalf("%s [%s]: sequence is empty", source, id) + } + + seq := obiseq.NewBioSequenceOwning(id, seqDest, definition) + seq.SetSource(source) + sequences = append(sequences, seq) + + if !hasMore { + break + } + } + + return sequences, nil +} + func _ParseFastaFile( input ChannelFileChunk, out obiiter.IBioSequence, UtoT bool, ) { - parser := FastaChunkParser(UtoT) for chunks := range input { - sequences, err := parser(chunks.Source, chunks.Raw) - // 
obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len()) + var sequences obiseq.BioSequenceSlice + var err error + + if chunks.Rope != nil { + sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT) + } else { + sequences, err = parser(chunks.Source, chunks.Raw) + } if err != nil { log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err) } out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences)) - } out.Done() - } func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) { @@ -245,7 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e 1024*1024, EndOfLastFastaEntry, "\n>", - true, + false, ) for i := 0; i < nworker; i++ { From 3d2e2057228d111aadfbd0d4cd6a73c71480947d Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 16:46:53 +0100 Subject: [PATCH 2/8] Refactor rope scanner and add FASTQ rope parser This commit refactors the rope scanner implementation by renaming gbRopeScanner to ropeScanner and extracting the common functionality into a new file. It also introduces a new FastqChunkParserRope function that parses FASTQ chunks directly from a rope without Pack(), enabling more efficient memory usage. The existing parsers are updated to use the new rope-based parser when available. The BioSequence type is enhanced with a TakeQualities method for more efficient quality data handling. 
--- pkg/obiformats/fastaseq_read.go | 4 +- pkg/obiformats/fastqseq_read.go | 85 ++++++++++++++++++++++++++++++++- pkg/obiformats/genbank_read.go | 81 +------------------------------ pkg/obiformats/rope_scanner.go | 80 +++++++++++++++++++++++++++++++ pkg/obiseq/biosequence.go | 9 ++++ 5 files changed, 176 insertions(+), 83 deletions(-) create mode 100644 pkg/obiformats/rope_scanner.go diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index c700ea2..5a5bbdd 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -214,7 +214,7 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic // Stops when '>' is found at the start of a line (next record) or at EOF. // Returns (dest with appended bases, hasMore). // hasMore=true means scanner is now positioned at '>' of the next record. -func (s *gbRopeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) { +func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) { lineStart := true for s.current != nil { @@ -252,7 +252,7 @@ func (s *gbRopeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) { // FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack(). func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) { - scanner := newGbRopeScanner(rope) + scanner := newRopeScanner(rope) sequences := obiseq.MakeBioSequenceSlice(100)[:0] for { diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index 9c94d2d..861705f 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str return parser } +// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack(). 
+func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) { + scanner := newRopeScanner(rope) + sequences := obiseq.MakeBioSequenceSlice(100)[:0] + + for { + // Line 1: @id [definition] + hline := scanner.ReadLine() + if hline == nil { + break + } + if len(hline) == 0 || hline[0] != '@' { + continue + } + header := hline[1:] + var id string + var definition string + sp := bytes.IndexByte(header, ' ') + if sp < 0 { + sp = bytes.IndexByte(header, '\t') + } + if sp < 0 { + id = string(header) + } else { + id = string(header[:sp]) + definition = string(bytes.TrimSpace(header[sp+1:])) + } + + // Line 2: sequence + sline := scanner.ReadLine() + if sline == nil { + log.Fatalf("@%s[%s]: unexpected EOF after header", id, source) + } + seqDest := make([]byte, len(sline)) + w := 0 + for _, b := range sline { + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if UtoT && b == 'u' { + b = 't' + } + seqDest[w] = b + w++ + } + seqDest = seqDest[:w] + if len(seqDest) == 0 { + log.Fatalf("@%s[%s]: sequence is empty", id, source) + } + + // Line 3: + (skip) + scanner.ReadLine() + + // Line 4: quality + qline := scanner.ReadLine() + + seq := obiseq.NewBioSequenceOwning(id, seqDest, definition) + seq.SetSource(source) + + if with_quality && qline != nil { + qDest := make([]byte, len(qline)) + copy(qDest, qline) + for i := range qDest { + qDest[i] -= quality_shift + } + seq.TakeQualities(qDest) + } + + sequences = append(sequences, seq) + } + + return sequences, nil +} + func _ParseFastqFile( input ChannelFileChunk, out obiiter.IBioSequence, @@ -313,7 +387,14 @@ func _ParseFastqFile( parser := FastqChunkParser(quality_shift, with_quality, UtoT) for chunks := range input { - sequences, err := parser(chunks.Source, chunks.Raw) + var sequences obiseq.BioSequenceSlice + var err error + + if chunks.Rope != nil { + sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, 
UtoT) + } else { + sequences, err = parser(chunks.Source, chunks.Raw) + } if err != nil { log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err) @@ -339,7 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e 1024*1024, EndOfLastFastqEntry, "\n@", - true, + false, ) for i := 0; i < nworker; i++ { diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 53a3057..3cabd28 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -29,70 +29,11 @@ const ( var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp") -// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation. -// The carry buffer (stack) handles lines that span two rope nodes. -type gbRopeScanner struct { - current *PieceOfChunk - pos int - carry [256]byte // max GenBank line = 80 chars; 256 gives ample margin - carryN int -} - -func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner { - return &gbRopeScanner{current: rope} -} - -// ReadLine returns the next line without the trailing \n (or \r\n). -// Returns nil at end of rope. The returned slice aliases carry[] or the node -// data and is valid only until the next ReadLine call. 
-func (s *gbRopeScanner) ReadLine() []byte { - for { - if s.current == nil { - if s.carryN > 0 { - n := s.carryN - s.carryN = 0 - return s.carry[:n] - } - return nil - } - - data := s.current.data[s.pos:] - idx := bytes.IndexByte(data, '\n') - - if idx >= 0 { - var line []byte - if s.carryN == 0 { - line = data[:idx] - } else { - n := copy(s.carry[s.carryN:], data[:idx]) - s.carryN += n - line = s.carry[:s.carryN] - s.carryN = 0 - } - s.pos += idx + 1 - if s.pos >= len(s.current.data) { - s.current = s.current.Next() - s.pos = 0 - } - if len(line) > 0 && line[len(line)-1] == '\r' { - line = line[:len(line)-1] - } - return line - } - - // No \n in this node: accumulate into carry and advance - n := copy(s.carry[s.carryN:], data) - s.carryN += n - s.current = s.current.Next() - s.pos = 0 - } -} - // extractSequence scans the ORIGIN section byte-by-byte directly on the rope, // appending compacted bases to dest. Returns the extended slice. // Stops and returns when "//" is found at the start of a line. // The scanner is left positioned after the "//" line. -func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte { +func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte { lineStart := true skipDigits := true @@ -139,24 +80,6 @@ func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte { return dest } -// skipToNewline advances the scanner past the next '\n'. -func (s *gbRopeScanner) skipToNewline() { - for s.current != nil { - data := s.current.data[s.pos:] - idx := bytes.IndexByte(data, '\n') - if idx >= 0 { - s.pos += idx + 1 - if s.pos >= len(s.current.data) { - s.current = s.current.Next() - s.pos = 0 - } - return - } - s.current = s.current.Next() - s.pos = 0 - } -} - // parseLseqFromLocus extracts the declared sequence length from a LOCUS line. // Format: "LOCUS bp ..." // Returns -1 if not found or parse error. 
@@ -205,7 +128,7 @@ func GenbankChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) { state := inHeader - scanner := newGbRopeScanner(rope) + scanner := newRopeScanner(rope) sequences := obiseq.MakeBioSequenceSlice(100)[:0] id := "" diff --git a/pkg/obiformats/rope_scanner.go b/pkg/obiformats/rope_scanner.go new file mode 100644 index 0000000..a7217f5 --- /dev/null +++ b/pkg/obiformats/rope_scanner.go @@ -0,0 +1,80 @@ +package obiformats + +import "bytes" + +// ropeScanner reads lines from a PieceOfChunk rope without heap allocation. +// The carry buffer (stack) handles lines that span two rope nodes. +type ropeScanner struct { + current *PieceOfChunk + pos int + carry [256]byte // 256 gives ample margin for typical flat-file lines + carryN int +} + +func newRopeScanner(rope *PieceOfChunk) *ropeScanner { + return &ropeScanner{current: rope} +} + +// ReadLine returns the next line without the trailing \n (or \r\n). +// Returns nil at end of rope. The returned slice aliases carry[] or the node +// data and is valid only until the next ReadLine call. +func (s *ropeScanner) ReadLine() []byte { + for { + if s.current == nil { + if s.carryN > 0 { + n := s.carryN + s.carryN = 0 + return s.carry[:n] + } + return nil + } + + data := s.current.data[s.pos:] + idx := bytes.IndexByte(data, '\n') + + if idx >= 0 { + var line []byte + if s.carryN == 0 { + line = data[:idx] + } else { + n := copy(s.carry[s.carryN:], data[:idx]) + s.carryN += n + line = s.carry[:s.carryN] + s.carryN = 0 + } + s.pos += idx + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + if len(line) > 0 && line[len(line)-1] == '\r' { + line = line[:len(line)-1] + } + return line + } + + // No \n in this node: accumulate into carry and advance + n := copy(s.carry[s.carryN:], data) + s.carryN += n + s.current = s.current.Next() + s.pos = 0 + } +} + +// skipToNewline advances the scanner past the next '\n'. 
+func (s *ropeScanner) skipToNewline() { + for s.current != nil { + data := s.current.data[s.pos:] + idx := bytes.IndexByte(data, '\n') + if idx >= 0 { + s.pos += idx + 1 + if s.pos >= len(s.current.data) { + s.current = s.current.Next() + s.pos = 0 + } + return + } + s.current = s.current.Next() + s.pos = 0 + } +} diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index f3939d8..a362a34 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -480,6 +480,15 @@ func (s *BioSequence) SetQualities(qualities Quality) { s.qualities = CopySlice(qualities) } +// TakeQualities stores the slice directly without copying. +// The caller must not use the slice after this call. +func (s *BioSequence) TakeQualities(qualities Quality) { + if s.qualities != nil { + RecycleSlice(&s.qualities) + } + s.qualities = qualities +} + // A method that appends a byte slice to the qualities of the BioSequence. func (s *BioSequence) WriteQualities(data []byte) (int, error) { s.qualities = append(s.qualities, data...) From 09fbc217d3bcbb7fd04432dfe9dd4cbf82984878 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 17:02:05 +0100 Subject: [PATCH 3/8] Add EMBL rope parsing support and improve sequence extraction Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries. 
--- pkg/obiformats/embl_read.go | 154 +++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index d2fa9b1..5a87eeb 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise return parser } +// extractEmblSeq scans the sequence section of an EMBL record directly on the +// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of +// 10, separated by spaces, with a position number at the end. The section ends +// with "//". +func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte { + // We use ReadLine and scan each line for bases (skip digits, spaces, newlines). + for { + line := s.ReadLine() + if line == nil { + break + } + if len(line) >= 2 && line[0] == '/' && line[1] == '/' { + break + } + // Lines start with 5 spaces; bases follow separated by single spaces. + // Digits at the end are the position counter — skip them. + // Simplest: take every byte that is a letter. + for _, b := range line { + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if UtoT && b == 'u' { + b = 't' + } + if b >= 'a' && b <= 'z' { + dest = append(dest, b) + } + } + } + return dest +} + +// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack(). 
+func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) { + scanner := newRopeScanner(rope) + sequences := obiseq.MakeBioSequenceSlice(100)[:0] + + var id string + var scientificName string + defBytes := make([]byte, 0, 256) + featBytes := make([]byte, 0, 1024) + taxid := 1 + inSeq := false + + for { + line := scanner.ReadLine() + if line == nil { + break + } + + if inSeq { + // Should not happen — extractEmblSeq consumed up to "//" + inSeq = false + continue + } + + switch { + case bytes.HasPrefix(line, []byte("ID ")): + id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0]) + case bytes.HasPrefix(line, []byte("OS ")): + scientificName = string(bytes.TrimSpace(line[5:])) + case bytes.HasPrefix(line, []byte("DE ")): + if len(defBytes) > 0 { + defBytes = append(defBytes, ' ') + } + defBytes = append(defBytes, bytes.TrimSpace(line[5:])...) + case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")): + featBytes = append(featBytes, line...) + case withFeatureTable && bytes.Equal(line, []byte("FH")): + featBytes = append(featBytes, '\n') + featBytes = append(featBytes, line...) + case bytes.HasPrefix(line, []byte("FT ")): + if withFeatureTable { + featBytes = append(featBytes, '\n') + featBytes = append(featBytes, line...) + } + if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) { + rest := line[37:] + end := bytes.IndexByte(rest, '"') + if end > 0 { + taxid, _ = strconv.Atoi(string(rest[:end])) + } + } + case bytes.HasPrefix(line, []byte(" ")): + // First sequence line: extract all bases via extractEmblSeq, + // which also consumes this line's remaining content. + // But ReadLine already consumed this line — we need to process it + // plus subsequent lines. Process this line inline then call helper.
+ seqDest := make([]byte, 0, 4096) + for _, b := range line { + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if UtoT && b == 'u' { + b = 't' + } + if b >= 'a' && b <= 'z' { + seqDest = append(seqDest, b) + } + } + seqDest = scanner.extractEmblSeq(seqDest, UtoT) + + seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes)) + seq.SetSource(source) + if withFeatureTable { + seq.SetFeatures(featBytes) + } + annot := seq.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, seq) + + // Reset state + id = "" + scientificName = "" + defBytes = defBytes[:0] + featBytes = featBytes[:0] + taxid = 1 + + case bytes.Equal(line, []byte("//")): + // record ended without SQ/sequence section (e.g. WGS entries) + if id != "" { + seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes)) + seq.SetSource(source) + if withFeatureTable { + seq.SetFeatures(featBytes) + } + annot := seq.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, seq) + } + id = "" + scientificName = "" + defBytes = defBytes[:0] + featBytes = featBytes[:0] + taxid = 1 + } + } + + return sequences, nil +} + func _ParseEmblFile( input ChannelFileChunk, out obiiter.IBioSequence, @@ -171,7 +314,14 @@ func _ParseEmblFile( for chunks := range input { order := chunks.Order - sequences, err := parser(chunks.Source, chunks.Raw) + var sequences obiseq.BioSequenceSlice + var err error + + if chunks.Rope != nil { + sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT) + } else { + sequences, err = parser(chunks.Source, chunks.Raw) + } if err != nil { log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err) @@ -196,7 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er 1024*1024*128, EndOfLastFlatFileEntry, "\nID ", - true, + false, ) newIter := obiiter.MakeIBioSequence() From 
8c318c480e553e32ccc78dcc2f479bbaf4f704a7 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 11 Mar 2026 17:05:34 +0100 Subject: [PATCH 4/8] replace fixed-size carry buffer with dynamic slice Replace the fixed [256]byte carry buffer with a dynamic []byte slice to support arbitrarily long lines without heap allocation during accumulation. Update all carry buffer handling logic to use len(s.carry) and append instead of fixed-size copy operations. --- pkg/obiformats/rope_scanner.go | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/pkg/obiformats/rope_scanner.go b/pkg/obiformats/rope_scanner.go index a7217f5..b8c6544 100644 --- a/pkg/obiformats/rope_scanner.go +++ b/pkg/obiformats/rope_scanner.go @@ -2,13 +2,12 @@ package obiformats import "bytes" -// ropeScanner reads lines from a PieceOfChunk rope without heap allocation. -// The carry buffer (stack) handles lines that span two rope nodes. +// ropeScanner reads lines from a PieceOfChunk rope. +// The carry buffer handles lines that span two rope nodes; it grows as needed. type ropeScanner struct { current *PieceOfChunk pos int - carry [256]byte // 256 gives ample margin for typical flat-file lines - carryN int + carry []byte } func newRopeScanner(rope *PieceOfChunk) *ropeScanner { @@ -21,10 +20,10 @@ func newRopeScanner(rope *PieceOfChunk) *ropeScanner { func (s *ropeScanner) ReadLine() []byte { for { if s.current == nil { - if s.carryN > 0 { - n := s.carryN - s.carryN = 0 - return s.carry[:n] + if len(s.carry) > 0 { + line := s.carry + s.carry = s.carry[:0] + return line } return nil } @@ -34,13 +33,12 @@ func (s *ropeScanner) ReadLine() []byte { if idx >= 0 { var line []byte - if s.carryN == 0 { + if len(s.carry) == 0 { line = data[:idx] } else { - n := copy(s.carry[s.carryN:], data[:idx]) - s.carryN += n - line = s.carry[:s.carryN] - s.carryN = 0 + s.carry = append(s.carry, data[:idx]...) 
+ line = s.carry + s.carry = s.carry[:0] } s.pos += idx + 1 if s.pos >= len(s.current.data) { @@ -54,8 +52,7 @@ func (s *ropeScanner) ReadLine() []byte { } // No \n in this node: accumulate into carry and advance - n := copy(s.carry[s.carryN:], data) - s.carryN += n + s.carry = append(s.carry, data...) s.current = s.current.Next() s.pos = 0 } From 6ee87506359bda258729d878e61e07daa6dbb166 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 12 Mar 2026 18:41:03 +0100 Subject: [PATCH 5/8] Replace SplitInTwo with LeftSplitInTwo/RightSplitInTwo for precise splitting Replace SplitInTwo calls with LeftSplitInTwo or RightSplitInTwo depending on the intended split direction. In fastseq_json_header.go, extract rank from suffix without splitting; in biosequenceslice.go and taxid.go, use LeftSplitInTwo to split from the left; add RightSplitInTwo utility function for splitting from the right. --- pkg/obiformats/fastseq_json_header.go | 2 +- pkg/obiseq/biosequenceslice.go | 2 +- pkg/obitax/taxid.go | 2 +- pkg/obiutils/strings.go | 16 +++++++++++++++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pkg/obiformats/fastseq_json_header.go b/pkg/obiformats/fastseq_json_header.go index 8006c53..af8de1c 100644 --- a/pkg/obiformats/fastseq_json_header.go +++ b/pkg/obiformats/fastseq_json_header.go @@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string { case strings.HasSuffix(skey, "_taxid"): if dataType == jsonparser.Number || dataType == jsonparser.String { - rank, _ := obiutils.SplitInTwo(skey, '_') + rank := skey[:len(skey)-len("_taxid")] taxid := string(value) sequence.SetTaxid(taxid, rank) diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 597a6dd..da16bf4 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa return nil, fmt.Errorf("sequence %v has no path", s.Id()) 
} last := path[len(path)-1] - taxname, _ := obiutils.SplitInTwo(last, ':') + taxname, _ := obiutils.LeftSplitInTwo(last, ':') if idx, ok := s.GetIntAttribute("seq_number"); !ok { return nil, errors.New("sequences are not numbered") } else { diff --git a/pkg/obitax/taxid.go b/pkg/obitax/taxid.go index f22ec4d..313a20d 100644 --- a/pkg/obitax/taxid.go +++ b/pkg/obitax/taxid.go @@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory { // It extracts the relevant part of the string after the first colon (':') if present. func (f *TaxidFactory) FromString(taxid string) (Taxid, error) { taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid) - part1, part2 := obiutils.SplitInTwo(taxid, ':') + part1, part2 := obiutils.LeftSplitInTwo(taxid, ':') if len(part2) == 0 { taxid = part1 } else { diff --git a/pkg/obiutils/strings.go b/pkg/obiutils/strings.go index 2586206..c07b919 100644 --- a/pkg/obiutils/strings.go +++ b/pkg/obiutils/strings.go @@ -144,7 +144,7 @@ func (r *AsciiSet) TrimLeft(s string) string { return s[i:] } -func SplitInTwo(s string, sep byte) (string, string) { +func LeftSplitInTwo(s string, sep byte) (string, string) { i := 0 for ; i < len(s); i++ { c := s[i] @@ -157,3 +157,17 @@ func SplitInTwo(s string, sep byte) (string, string) { return s[:i], s[i+1:] } + +func RightSplitInTwo(s string, sep byte) (string, string) { + i := len(s) - 1 + for ; i >= 0; i-- { + c := s[i] + if c == sep { + break + } + } + if i < 0 { + return s, "" + } + return s[:i], s[i+1:] +} From 8dd32dc1bf621a413ed5390438ff58b7c87825f3 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 12 Mar 2026 18:48:20 +0100 Subject: [PATCH 6/8] Fix CompressStream call to use compressed variable Replace hardcoded boolean with the `compressed` variable in CompressStream call to ensure correct compression behavior.
--- pkg/obitools/obiclean/graph.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/obitools/obiclean/graph.go b/pkg/obitools/obiclean/graph.go index 9952c5e..e6ac7fa 100644 --- a/pkg/obitools/obiclean/graph.go +++ b/pkg/obitools/obiclean/graph.go @@ -64,7 +64,7 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) { fmt.Println(err) } - destfile, err := obiutils.CompressStream(file, true, true) + destfile, err := obiutils.CompressStream(file, compressed, true) if err != nil { fmt.Println(err) } From abe935aa18bdb4ae8b80a026b420d10eb9a13a8a Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 12 Mar 2026 19:20:45 +0100 Subject: [PATCH 7/8] Add help target, colorize output, and improve release workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add colored terminal output support (GREEN, YELLOW, BLUE, NC) - Introduce `help` target to document all Makefile targets - Enhance `bump-version` to accept VERSION env var for manual version setting - Refactor jjpush: split into modular targets (jjpush-notes, jjpush-push, jjpush-tag) - Replace orla with aichat for AI-powered release notes generation - Add robust JSON parsing using Python for release notes extraction - Use stakk for PR submission (replacing raw `jj git push`) - Generate and store release notes in temp files for tag creation - Add installation instructions to release tags - Update .PHONY with new targets 4.4.20: Rope-based parsing, improved release tooling, and bug fixes ### Enhancements - **Rope-based parsing**: Added direct rope parsing for FASTA, EMBL, and FASTQ formats via `FastaChunkParserRope`, `EmblChunkParserRope`, and `FastqChunkParserRope` functions, eliminating unnecessary memory allocation via Pack(). Sequence extraction now supports U→T conversion and improved line ending detection.
- **Rope scanner refactoring**: Unified rope scanning logic under a new `ropeScanner`, improving maintainability and consistency across parsers. - **Sequence handling**: Added `TakeQualities()` method to BioSequence for more efficient quality data handling. ### Bug Fixes - **Compression behavior**: Fixed CompressStream to correctly use the `compressed` variable instead of a hardcoded boolean. - **String splitting**: Replaced ambiguous `SplitInTwo` calls with precise `LeftSplitInTwo` or `RightSplitInTwo`, and added dedicated right-split utility. ### Tooling & Workflow Improvements - **Makefile enhancements**: Added colored terminal output, a `help` target for documenting all targets, and improved release workflow automation. - **Release process**: Refactored `jjpush` into modular targets (`jjpush-notes`, `jjpush-push`, `jjpush-tag`), replaced `orla` with `aichat` for AI-assisted release notes, and introduced robust JSON parsing using Python. Release notes are now generated and stored in temp files for tag creation. - **Versioning**: `bump-version` now supports the VERSION environment variable for manual version setting. - **Submission**: Switched from raw `jj git push` to `stakk` for PR submission. ### Internal Notes - Installation instructions are now included in release tags. - Fixed-size carry buffer replaced with dynamic slice for arbitrarily long line support without extra allocations. 
--- Makefile | 108 ++++++++++++++++++++++++++++++++++------------- tools/json2md.py | 36 ++++++++++++++++ 2 files changed, 115 insertions(+), 29 deletions(-) create mode 100755 tools/json2md.py diff --git a/Makefile b/Makefile index afc3772..d9fb833 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,11 @@ #export GOBIN=$(GOPATH)/bin #export PATH=$(GOBIN):$(shell echo $${PATH}) +GREEN := \033[0;32m +YELLOW := \033[0;33m +BLUE := \033[0;34m +NC := \033[0m + GOFLAGS= GOCMD=go GOBUILD=$(GOCMD) build $(GOFLAGS) @@ -60,6 +65,28 @@ endif OUTPUT:=$(shell mktemp) +help: + @printf "$(GREEN)OBITools4 Makefile$(NC)\n\n" + @printf "$(BLUE)Main targets:$(NC)\n" + @printf " %-20s %s\n" "all" "Build all obitools (default)" + @printf " %-20s %s\n" "obitools" "Build all obitools binaries to build/" + @printf " %-20s %s\n" "test" "Run Go unit tests" + @printf " %-20s %s\n" "obitests" "Run integration tests (obitests/)" + @printf " %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)" + @printf " %-20s %s\n" "update-deps" "Update all Go dependencies" + @printf "\n$(BLUE)Jujutsu workflow:$(NC)\n" + @printf " %-20s %s\n" "jjnew" "Document current commit and start a new one" + @printf " %-20s %s\n" "jjpush" "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)" + @printf " %-20s %s\n" "jjfetch" "Fetch latest commits from origin" + @printf "\n$(BLUE)Required tools:$(NC)\n" + @printf " %-20s " "go"; command -v go >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n" + @printf " %-20s " "git"; command -v git >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n" + @printf " %-20s " "jj"; command -v jj >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n" + @printf " %-20s " "gh"; command -v gh >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf 
"$(YELLOW)✗ not found$(NC) (brew install gh)\n" + @printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n" + @printf " %-20s " "aichat"; command -v aichat >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC) (https://github.com/sigoden/aichat)\n" + @printf " %-20s " "jq"; command -v jq >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC) (brew install jq)\n" + all: install-githook obitools obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS)) @@ -106,15 +133,20 @@ pkg/obioptions/version.go: version.txt .FORCE @rm -f $(OUTPUT) bump-version: - @echo "Incrementing version..." @current=$$(cat version.txt); \ - echo " Current version: $$current"; \ - major=$$(echo $$current | cut -d. -f1); \ - minor=$$(echo $$current | cut -d. -f2); \ - patch=$$(echo $$current | cut -d. -f3); \ - new_patch=$$((patch + 1)); \ - new_version="$$major.$$minor.$$new_patch"; \ - echo " New version: $$new_version"; \ + if [ -n "$(VERSION)" ]; then \ + new_version="$(VERSION)"; \ + echo "Setting version to $$new_version (was $$current)"; \ + else \ + echo "Incrementing version..."; \ + echo " Current version: $$current"; \ + major=$$(echo $$current | cut -d. -f1); \ + minor=$$(echo $$current | cut -d. -f2); \ + patch=$$(echo $$current | cut -d. 
-f3); \ + new_patch=$$((patch + 1)); \ + new_version="$$major.$$minor.$$new_patch"; \ + echo " New version: $$new_version"; \ + fi; \ echo "$$new_version" > version.txt @echo "✓ Version updated in version.txt" @$(MAKE) pkg/obioptions/version.go @@ -130,6 +162,7 @@ jjnew: jjpush: @$(MAKE) jjpush-describe @$(MAKE) jjpush-bump + @$(MAKE) jjpush-notes @$(MAKE) jjpush-push @$(MAKE) jjpush-tag @echo "$(GREEN)✓ Release complete$(NC)" @@ -142,44 +175,61 @@ jjpush-bump: @echo "$(BLUE)→ Creating new commit for version bump...$(NC)" @jj new @$(MAKE) bump-version - @echo "$(BLUE)→ Documenting version bump commit...$(NC)" - @jj auto-describe -jjpush-push: - @echo "$(BLUE)→ Pushing commits...$(NC)" - @jj git push --change @ - -jjpush-tag: +jjpush-notes: @version=$$(cat version.txt); \ - tag_name="Release_$$version"; \ - echo "$(BLUE)→ Generating release notes for $$tag_name...$(NC)"; \ - release_message="Release $$version"; \ - if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \ - previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' HEAD^ 2>/dev/null); \ + echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \ + release_title="Release $$version"; \ + release_body=""; \ + if command -v aichat >/dev/null 2>&1; then \ + previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \ if [ -z "$$previous_tag" ]; then \ echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \ else \ raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \ - ORLA_MAX_TOOL_CALLS=50 orla agent -m ollama:qwen3-coder-next:latest \ + aichat \ "Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. 
Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"\", \"body\": \"\"}" 2>/dev/null) || true; \ if [ -n "$$raw_output" ]; then \ - sanitized=$$(echo "$$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037'); \ - release_title=$$(echo "$$sanitized" | jq -r '.title // empty' 2>/dev/null) ; \ - release_body=$$(echo "$$sanitized" | jq -r '.body // empty' 2>/dev/null) ; \ - if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \ - release_message="$$release_title"$$'\n\n'"$$release_body"; \ + notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \ + if [ -n "$$notes" ]; then \ + release_title=$$(echo "$$notes" | head -1); \ + release_body=$$(echo "$$notes" | tail -n +3); \ else \ echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \ fi; \ fi; \ fi; \ fi; \ + printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \ + printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \ + echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \ + jj desc -m "$$release_title"$$'\n\n'"$$release_body" + +jjpush-push: + @echo "$(BLUE)→ Pushing commits...$(NC)" + @jj git push --change @ + @echo "$(BLUE)→ Creating/updating PR...$(NC)" + @release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \ + release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \ + branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \ + if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \ + gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \ + || gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \ + || echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \ + fi + +jjpush-tag: + @version=$$(cat version.txt); \ + tag_name="Release_$$version"; \ + 
release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \ + release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \ install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. 
Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \ - release_message="$$release_message$$install_section"; \ + release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \ echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \ git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \ echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \ - git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)" + git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \ + rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt jjfetch: @echo "$(YELLOW)→ Pulling latest commits...$(NC)" @@ -187,5 +237,5 @@ jjfetch: @jj new master@origin @echo "$(GREEN)✓ Latest commits pulled$(NC)" -.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjpush-describe jjpush-bump jjpush-push jjpush-tag jjfetch bump-version .FORCE +.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE .FORCE: diff --git a/tools/json2md.py b/tools/json2md.py new file mode 100755 index 0000000..62ca2e6 --- /dev/null +++ b/tools/json2md.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +Read potentially malformed JSON from stdin (aichat output), extract title and +body, and print them as plain text: title on first line, blank line, then body. +Exits with 1 on failure (no output). 
+""" + +import sys +import json +import re + +text = sys.stdin.read() + +m = re.search(r'\{.*\}', text, re.DOTALL) +if not m: + sys.exit(1) + +s = m.group() +obj = None + +try: + obj = json.loads(s) +except Exception: + s2 = re.sub(r'(? Date: Thu, 12 Mar 2026 20:06:24 +0100 Subject: [PATCH 8/8] 4.4.20: Rope-based parsing, improved release tooling, and bug fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Enhancements - **Rope-based parsing**: Added direct rope parsing for FASTA, EMBL, and FASTQ formats via `FastaChunkParserRope`, `EmblChunkParserRope`, and `FastqChunkParserRope`. Sequence extraction now supports U→T conversion and improved line ending detection. - **Rope scanner refactoring**: Unified rope scanning logic under a new `ropeScanner`, improving maintainability and consistency. - **Sequence handling**: Added `TakeQualities()` method to BioSequence for more efficient quality data handling. ### Bug Fixes - **Compression behavior**: Fixed `CompressStream` to correctly use the `compressed` variable instead of a hardcoded boolean. - **String splitting**: Replaced ambiguous `SplitInTwo` calls with precise `LeftSplitInTwo` or `RightSplitInTwo`, and added dedicated right-split utility. ### Tooling & Workflow Improvements - **Makefile enhancements**: Added colored terminal output, a `help` target for documenting all targets, and improved release workflow automation. - **Release process**: Refactored `jjpush` into modular targets (`jjpush-notes`, `jjpush-push`, `jjpush-tag`), replaced `orla` with `aichat` for AI-assisted release notes, and introduced robust JSON parsing using Python. Release notes are now generated and stored in temp files for tag creation. - **Versioning**: `bump-version` now supports the VERSION environment variable for manual version setting. - **Submission**: Switched from raw `jj git push` to `stakk` for PR submission. 
### Internal Notes - Installation instructions are now included in release tags. - Fixed-size carry buffer replaced with dynamic slice for arbitrarily long line support without extra allocations. --- pkg/obioptions/version.go | 2 +- version.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 097c578..0de542a 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -3,7 +3,7 @@ package obioptions // Version is automatically updated by the Makefile from version.txt // The patch number (third digit) is incremented on each push to the repository -var _Version = "Release 4.4.19" +var _Version = "Release 4.4.20" // Version returns the version of the obitools package. // diff --git a/version.txt b/version.txt index 0c6fdde..d83088a 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -4.4.19 +4.4.20