mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Compare commits
9 Commits
push-uzpqq
...
push-uotrs
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd0c525f50 | ||
|
|
abe935aa18 | ||
|
|
8dd32dc1bf | ||
|
|
6ee8750635 | ||
|
|
8c318c480e | ||
|
|
09fbc217d3 | ||
|
|
3d2e205722 | ||
|
|
623116ab13 | ||
|
|
1e4509cb63 |
108
Makefile
108
Makefile
@@ -2,6 +2,11 @@
|
||||
#export GOBIN=$(GOPATH)/bin
|
||||
#export PATH=$(GOBIN):$(shell echo $${PATH})
|
||||
|
||||
GREEN := \033[0;32m
|
||||
YELLOW := \033[0;33m
|
||||
BLUE := \033[0;34m
|
||||
NC := \033[0m
|
||||
|
||||
GOFLAGS=
|
||||
GOCMD=go
|
||||
GOBUILD=$(GOCMD) build $(GOFLAGS)
|
||||
@@ -60,6 +65,28 @@ endif
|
||||
|
||||
OUTPUT:=$(shell mktemp)
|
||||
|
||||
help:
|
||||
@printf "$(GREEN)OBITools4 Makefile$(NC)\n\n"
|
||||
@printf "$(BLUE)Main targets:$(NC)\n"
|
||||
@printf " %-20s %s\n" "all" "Build all obitools (default)"
|
||||
@printf " %-20s %s\n" "obitools" "Build all obitools binaries to build/"
|
||||
@printf " %-20s %s\n" "test" "Run Go unit tests"
|
||||
@printf " %-20s %s\n" "obitests" "Run integration tests (obitests/)"
|
||||
@printf " %-20s %s\n" "bump-version" "Increment patch version (or set with VERSION=x.y.z)"
|
||||
@printf " %-20s %s\n" "update-deps" "Update all Go dependencies"
|
||||
@printf "\n$(BLUE)Jujutsu workflow:$(NC)\n"
|
||||
@printf " %-20s %s\n" "jjnew" "Document current commit and start a new one"
|
||||
@printf " %-20s %s\n" "jjpush" "Release: describe, bump, generate notes, push PR, tag (VERSION=x.y.z optional)"
|
||||
@printf " %-20s %s\n" "jjfetch" "Fetch latest commits from origin"
|
||||
@printf "\n$(BLUE)Required tools:$(NC)\n"
|
||||
@printf " %-20s " "go"; command -v go >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(go version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||
@printf " %-20s " "git"; command -v git >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(git --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||
@printf " %-20s " "jj"; command -v jj >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jj --version)" || printf "$(YELLOW)✗ not found$(NC)\n"
|
||||
@printf " %-20s " "gh"; command -v gh >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(gh --version | head -1)" || printf "$(YELLOW)✗ not found$(NC) (brew install gh)\n"
|
||||
@printf "\n$(BLUE)Optional tools (release notes generation):$(NC)\n"
|
||||
@printf " %-20s " "aichat"; command -v aichat >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(aichat --version)" || printf "$(YELLOW)✗ not found$(NC) (https://github.com/sigoden/aichat)\n"
|
||||
@printf " %-20s " "jq"; command -v jq >/dev/null 2>&1 && printf "$(GREEN)✓$(NC) %s\n" "$$(jq --version)" || printf "$(YELLOW)✗ not found$(NC) (brew install jq)\n"
|
||||
|
||||
all: install-githook obitools
|
||||
|
||||
obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS))
|
||||
@@ -106,15 +133,20 @@ pkg/obioptions/version.go: version.txt .FORCE
|
||||
@rm -f $(OUTPUT)
|
||||
|
||||
bump-version:
|
||||
@echo "Incrementing version..."
|
||||
@current=$$(cat version.txt); \
|
||||
echo " Current version: $$current"; \
|
||||
major=$$(echo $$current | cut -d. -f1); \
|
||||
minor=$$(echo $$current | cut -d. -f2); \
|
||||
patch=$$(echo $$current | cut -d. -f3); \
|
||||
new_patch=$$((patch + 1)); \
|
||||
new_version="$$major.$$minor.$$new_patch"; \
|
||||
echo " New version: $$new_version"; \
|
||||
if [ -n "$(VERSION)" ]; then \
|
||||
new_version="$(VERSION)"; \
|
||||
echo "Setting version to $$new_version (was $$current)"; \
|
||||
else \
|
||||
echo "Incrementing version..."; \
|
||||
echo " Current version: $$current"; \
|
||||
major=$$(echo $$current | cut -d. -f1); \
|
||||
minor=$$(echo $$current | cut -d. -f2); \
|
||||
patch=$$(echo $$current | cut -d. -f3); \
|
||||
new_patch=$$((patch + 1)); \
|
||||
new_version="$$major.$$minor.$$new_patch"; \
|
||||
echo " New version: $$new_version"; \
|
||||
fi; \
|
||||
echo "$$new_version" > version.txt
|
||||
@echo "✓ Version updated in version.txt"
|
||||
@$(MAKE) pkg/obioptions/version.go
|
||||
@@ -130,6 +162,7 @@ jjnew:
|
||||
jjpush:
|
||||
@$(MAKE) jjpush-describe
|
||||
@$(MAKE) jjpush-bump
|
||||
@$(MAKE) jjpush-notes
|
||||
@$(MAKE) jjpush-push
|
||||
@$(MAKE) jjpush-tag
|
||||
@echo "$(GREEN)✓ Release complete$(NC)"
|
||||
@@ -142,44 +175,61 @@ jjpush-bump:
|
||||
@echo "$(BLUE)→ Creating new commit for version bump...$(NC)"
|
||||
@jj new
|
||||
@$(MAKE) bump-version
|
||||
@echo "$(BLUE)→ Documenting version bump commit...$(NC)"
|
||||
@jj auto-describe
|
||||
|
||||
jjpush-push:
|
||||
@echo "$(BLUE)→ Pushing commits...$(NC)"
|
||||
@jj git push --change @
|
||||
|
||||
jjpush-tag:
|
||||
jjpush-notes:
|
||||
@version=$$(cat version.txt); \
|
||||
tag_name="Release_$$version"; \
|
||||
echo "$(BLUE)→ Generating release notes for $$tag_name...$(NC)"; \
|
||||
release_message="Release $$version"; \
|
||||
if command -v orla >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then \
|
||||
previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' HEAD^ 2>/dev/null); \
|
||||
echo "$(BLUE)→ Generating release notes for version $$version...$(NC)"; \
|
||||
release_title="Release $$version"; \
|
||||
release_body=""; \
|
||||
if command -v aichat >/dev/null 2>&1; then \
|
||||
previous_tag=$$(git describe --tags --abbrev=0 --match 'Release_*' 2>/dev/null); \
|
||||
if [ -z "$$previous_tag" ]; then \
|
||||
echo "$(YELLOW)⚠ No previous Release tag found, skipping release notes$(NC)"; \
|
||||
else \
|
||||
raw_output=$$(git log --format="%h %B" "$$previous_tag..HEAD" | \
|
||||
ORLA_MAX_TOOL_CALLS=50 orla agent -m ollama:qwen3-coder-next:latest \
|
||||
aichat \
|
||||
"Summarize the following commits into a GitHub release note for version $$version. Ignore commits related to version bumps, .gitignore changes, or any internal housekeeping that is irrelevant to end users. Describe each user-facing change precisely without exposing code. Eliminate redundancy. Output strictly valid JSON with no surrounding text, using this exact schema: {\"title\": \"<short release title>\", \"body\": \"<detailed markdown release notes>\"}" 2>/dev/null) || true; \
|
||||
if [ -n "$$raw_output" ]; then \
|
||||
sanitized=$$(echo "$$raw_output" | sed -n '/^{/,/^}/p' | tr -d '\000-\011\013-\014\016-\037'); \
|
||||
release_title=$$(echo "$$sanitized" | jq -r '.title // empty' 2>/dev/null) ; \
|
||||
release_body=$$(echo "$$sanitized" | jq -r '.body // empty' 2>/dev/null) ; \
|
||||
if [ -n "$$release_title" ] && [ -n "$$release_body" ]; then \
|
||||
release_message="$$release_title"$$'\n\n'"$$release_body"; \
|
||||
notes=$$(printf '%s\n' "$$raw_output" | python3 tools/json2md.py 2>/dev/null); \
|
||||
if [ -n "$$notes" ]; then \
|
||||
release_title=$$(echo "$$notes" | head -1); \
|
||||
release_body=$$(echo "$$notes" | tail -n +3); \
|
||||
else \
|
||||
echo "$(YELLOW)⚠ JSON parsing failed, using default release message$(NC)"; \
|
||||
fi; \
|
||||
fi; \
|
||||
fi; \
|
||||
fi; \
|
||||
printf '%s' "$$release_title" > /tmp/obitools4-release-title.txt; \
|
||||
printf '%s' "$$release_body" > /tmp/obitools4-release-body.txt; \
|
||||
echo "$(BLUE)→ Setting release notes as commit description...$(NC)"; \
|
||||
jj desc -m "$$release_title"$$'\n\n'"$$release_body"
|
||||
|
||||
jjpush-push:
|
||||
@echo "$(BLUE)→ Pushing commits...$(NC)"
|
||||
@jj git push --change @
|
||||
@echo "$(BLUE)→ Creating/updating PR...$(NC)"
|
||||
@release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$(cat version.txt)"); \
|
||||
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||
branch=$$(jj log -r @ --no-graph -T 'bookmarks.map(|b| b.name()).join("\n")' 2>/dev/null | head -1); \
|
||||
if [ -n "$$branch" ] && command -v gh >/dev/null 2>&1; then \
|
||||
gh pr create --title "$$release_title" --body "$$release_body" --base master --head "$$branch" 2>/dev/null \
|
||||
|| gh pr edit "$$branch" --title "$$release_title" --body "$$release_body" 2>/dev/null \
|
||||
|| echo "$(YELLOW)⚠ Could not create/update PR$(NC)"; \
|
||||
fi
|
||||
|
||||
jjpush-tag:
|
||||
@version=$$(cat version.txt); \
|
||||
tag_name="Release_$$version"; \
|
||||
release_title=$$(cat /tmp/obitools4-release-title.txt 2>/dev/null || echo "Release $$version"); \
|
||||
release_body=$$(cat /tmp/obitools4-release-body.txt 2>/dev/null || echo ""); \
|
||||
install_section=$$'\n## Installation\n\n### Pre-built binaries\n\nDownload the appropriate archive for your system from the\n[release assets](https://github.com/metabarcoding/obitools4/releases/tag/Release_'"$$version"')\nand extract it:\n\n#### Linux (AMD64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_amd64.tar.gz\n```\n\n#### Linux (ARM64)\n```bash\ntar -xzf obitools4_'"$$version"'_linux_arm64.tar.gz\n```\n\n#### macOS (Intel)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_amd64.tar.gz\n```\n\n#### macOS (Apple Silicon)\n```bash\ntar -xzf obitools4_'"$$version"'_darwin_arm64.tar.gz\n```\n\nAll OBITools4 binaries are included in each archive.\n\n### From source\n\nYou can also compile and install OBITools4 directly from source using the\ninstallation script:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | bash -s -- --version '"$$version"'\n```\n\nBy default binaries are installed in `/usr/local/bin`. Use `--install-dir` to\nchange the destination and `--obitools-prefix` to add a prefix to command names:\n\n```bash\ncurl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh | \\\n bash -s -- --version '"$$version"' --install-dir ~/local --obitools-prefix k\n```\n'; \
|
||||
release_message="$$release_message$$install_section"; \
|
||||
release_message="$$release_title"$$'\n\n'"$$release_body$$install_section"; \
|
||||
echo "$(BLUE)→ Creating tag $$tag_name...$(NC)"; \
|
||||
git tag -a "$$tag_name" -m "$$release_message" 2>/dev/null || echo "$(YELLOW)⚠ Tag $$tag_name already exists$(NC)"; \
|
||||
echo "$(BLUE)→ Pushing tag $$tag_name...$(NC)"; \
|
||||
git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"
|
||||
git push origin "$$tag_name" 2>/dev/null || echo "$(YELLOW)⚠ Tag push failed or already pushed$(NC)"; \
|
||||
rm -f /tmp/obitools4-release-title.txt /tmp/obitools4-release-body.txt
|
||||
|
||||
jjfetch:
|
||||
@echo "$(YELLOW)→ Pulling latest commits...$(NC)"
|
||||
@@ -187,5 +237,5 @@ jjfetch:
|
||||
@jj new master@origin
|
||||
@echo "$(GREEN)✓ Latest commits pulled$(NC)"
|
||||
|
||||
.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjpush-describe jjpush-bump jjpush-push jjpush-tag jjfetch bump-version .FORCE
|
||||
.PHONY: all obitools update-deps obitests githubtests help jjnew jjpush jjpush-describe jjpush-bump jjpush-notes jjpush-push jjpush-tag jjfetch bump-version .FORCE
|
||||
.FORCE:
|
||||
|
||||
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
|
||||
return parser
|
||||
}
|
||||
|
||||
// extractEmblSeq scans the sequence section of an EMBL record directly on the
|
||||
// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
|
||||
// 10, separated by spaces, with a position number at the end. The section ends
|
||||
// with "//".
|
||||
func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
|
||||
// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
|
||||
for {
|
||||
line := s.ReadLine()
|
||||
if line == nil {
|
||||
break
|
||||
}
|
||||
if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
|
||||
break
|
||||
}
|
||||
// Lines start with 5 spaces; bases follow separated by single spaces.
|
||||
// Digits at the end are the position counter — skip them.
|
||||
// Simplest: take every byte that is a letter.
|
||||
for _, b := range line {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
if b >= 'a' && b <= 'z' {
|
||||
dest = append(dest, b)
|
||||
}
|
||||
}
|
||||
}
|
||||
return dest
|
||||
}
|
||||
|
||||
// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
|
||||
func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
var id string
|
||||
var scientificName string
|
||||
defBytes := make([]byte, 0, 256)
|
||||
featBytes := make([]byte, 0, 1024)
|
||||
var taxid int
|
||||
inSeq := false
|
||||
|
||||
for {
|
||||
line := scanner.ReadLine()
|
||||
if line == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if inSeq {
|
||||
// Should not happen — extractEmblSeq consumed up to "//"
|
||||
inSeq = false
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case bytes.HasPrefix(line, []byte("ID ")):
|
||||
id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
|
||||
case bytes.HasPrefix(line, []byte("OS ")):
|
||||
scientificName = string(bytes.TrimSpace(line[5:]))
|
||||
case bytes.HasPrefix(line, []byte("DE ")):
|
||||
if len(defBytes) > 0 {
|
||||
defBytes = append(defBytes, ' ')
|
||||
}
|
||||
defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
|
||||
case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")):
|
||||
featBytes = append(featBytes, line...)
|
||||
case withFeatureTable && bytes.Equal(line, []byte("FH")):
|
||||
featBytes = append(featBytes, '\n')
|
||||
featBytes = append(featBytes, line...)
|
||||
case bytes.HasPrefix(line, []byte("FT ")):
|
||||
if withFeatureTable {
|
||||
featBytes = append(featBytes, '\n')
|
||||
featBytes = append(featBytes, line...)
|
||||
}
|
||||
if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) {
|
||||
rest := line[37:]
|
||||
end := bytes.IndexByte(rest, '"')
|
||||
if end > 0 {
|
||||
taxid, _ = strconv.Atoi(string(rest[:end]))
|
||||
}
|
||||
}
|
||||
case bytes.HasPrefix(line, []byte(" ")):
|
||||
// First sequence line: extract all bases via extractEmblSeq,
|
||||
// which also consumes this line's remaining content.
|
||||
// But ReadLine already consumed this line — we need to process it
|
||||
// plus subsequent lines. Process this line inline then call helper.
|
||||
seqDest := make([]byte, 0, 4096)
|
||||
for _, b := range line {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
if b >= 'a' && b <= 'z' {
|
||||
seqDest = append(seqDest, b)
|
||||
}
|
||||
}
|
||||
seqDest = scanner.extractEmblSeq(seqDest, UtoT)
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
|
||||
seq.SetSource(source)
|
||||
if withFeatureTable {
|
||||
seq.SetFeatures(featBytes)
|
||||
}
|
||||
annot := seq.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
sequences = append(sequences, seq)
|
||||
|
||||
// Reset state
|
||||
id = ""
|
||||
scientificName = ""
|
||||
defBytes = defBytes[:0]
|
||||
featBytes = featBytes[:0]
|
||||
taxid = 1
|
||||
|
||||
case bytes.Equal(line, []byte("//")):
|
||||
// record ended without SQ/sequence section (e.g. WGS entries)
|
||||
if id != "" {
|
||||
seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
|
||||
seq.SetSource(source)
|
||||
if withFeatureTable {
|
||||
seq.SetFeatures(featBytes)
|
||||
}
|
||||
annot := seq.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
annot["taxid"] = taxid
|
||||
sequences = append(sequences, seq)
|
||||
}
|
||||
id = ""
|
||||
scientificName = ""
|
||||
defBytes = defBytes[:0]
|
||||
featBytes = featBytes[:0]
|
||||
taxid = 1
|
||||
}
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseEmblFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
@@ -171,7 +314,14 @@ func _ParseEmblFile(
|
||||
|
||||
for chunks := range input {
|
||||
order := chunks.Order
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
|
||||
@@ -196,7 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
||||
1024*1024*128,
|
||||
EndOfLastFlatFileEntry,
|
||||
"\nID ",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
|
||||
return parser
|
||||
}
|
||||
|
||||
// extractFastaSeq scans sequence bytes from the rope directly into dest,
|
||||
// appending valid nucleotide characters and skipping whitespace.
|
||||
// Stops when '>' is found at the start of a line (next record) or at EOF.
|
||||
// Returns (dest with appended bases, hasMore).
|
||||
// hasMore=true means scanner is now positioned at '>' of the next record.
|
||||
func (s *ropeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
|
||||
lineStart := true
|
||||
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
for i, b := range data {
|
||||
if lineStart && b == '>' {
|
||||
s.pos += i
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest, true
|
||||
}
|
||||
if b == '\n' || b == '\r' {
|
||||
lineStart = true
|
||||
continue
|
||||
}
|
||||
lineStart = false
|
||||
if b == ' ' || b == '\t' {
|
||||
continue
|
||||
}
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
dest = append(dest, b)
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest, false
|
||||
}
|
||||
|
||||
// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
|
||||
func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
for {
|
||||
bline := scanner.ReadLine()
|
||||
if bline == nil {
|
||||
break
|
||||
}
|
||||
if len(bline) == 0 || bline[0] != '>' {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse header: ">id definition"
|
||||
header := bline[1:]
|
||||
var id string
|
||||
var definition string
|
||||
sp := bytes.IndexByte(header, ' ')
|
||||
if sp < 0 {
|
||||
sp = bytes.IndexByte(header, '\t')
|
||||
}
|
||||
if sp < 0 {
|
||||
id = string(header)
|
||||
} else {
|
||||
id = string(header[:sp])
|
||||
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||
}
|
||||
|
||||
seqDest := make([]byte, 0, 4096)
|
||||
var hasMore bool
|
||||
seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
|
||||
|
||||
if len(seqDest) == 0 {
|
||||
log.Fatalf("%s [%s]: sequence is empty", source, id)
|
||||
}
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||
seq.SetSource(source)
|
||||
sequences = append(sequences, seq)
|
||||
|
||||
if !hasMore {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseFastaFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
UtoT bool,
|
||||
) {
|
||||
|
||||
parser := FastaChunkParser(UtoT)
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
||||
}
|
||||
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
||||
|
||||
}
|
||||
|
||||
out.Done()
|
||||
|
||||
}
|
||||
|
||||
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
@@ -245,7 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastaEntry,
|
||||
"\n>",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
@@ -303,6 +303,80 @@ func FastqChunkParser(quality_shift byte, with_quality bool, UtoT bool) func(str
|
||||
return parser
|
||||
}
|
||||
|
||||
// FastqChunkParserRope parses a FASTQ chunk directly from a rope without Pack().
|
||||
func FastqChunkParserRope(source string, rope *PieceOfChunk, quality_shift byte, with_quality, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
for {
|
||||
// Line 1: @id [definition]
|
||||
hline := scanner.ReadLine()
|
||||
if hline == nil {
|
||||
break
|
||||
}
|
||||
if len(hline) == 0 || hline[0] != '@' {
|
||||
continue
|
||||
}
|
||||
header := hline[1:]
|
||||
var id string
|
||||
var definition string
|
||||
sp := bytes.IndexByte(header, ' ')
|
||||
if sp < 0 {
|
||||
sp = bytes.IndexByte(header, '\t')
|
||||
}
|
||||
if sp < 0 {
|
||||
id = string(header)
|
||||
} else {
|
||||
id = string(header[:sp])
|
||||
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||
}
|
||||
|
||||
// Line 2: sequence
|
||||
sline := scanner.ReadLine()
|
||||
if sline == nil {
|
||||
log.Fatalf("@%s[%s]: unexpected EOF after header", id, source)
|
||||
}
|
||||
seqDest := make([]byte, len(sline))
|
||||
w := 0
|
||||
for _, b := range sline {
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
seqDest[w] = b
|
||||
w++
|
||||
}
|
||||
seqDest = seqDest[:w]
|
||||
if len(seqDest) == 0 {
|
||||
log.Fatalf("@%s[%s]: sequence is empty", id, source)
|
||||
}
|
||||
|
||||
// Line 3: + (skip)
|
||||
scanner.ReadLine()
|
||||
|
||||
// Line 4: quality
|
||||
qline := scanner.ReadLine()
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||
seq.SetSource(source)
|
||||
|
||||
if with_quality && qline != nil {
|
||||
qDest := make([]byte, len(qline))
|
||||
copy(qDest, qline)
|
||||
for i := range qDest {
|
||||
qDest[i] -= quality_shift
|
||||
}
|
||||
seq.TakeQualities(qDest)
|
||||
}
|
||||
|
||||
sequences = append(sequences, seq)
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseFastqFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
@@ -313,7 +387,14 @@ func _ParseFastqFile(
|
||||
parser := FastqChunkParser(quality_shift, with_quality, UtoT)
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = FastqChunkParserRope(chunks.Source, chunks.Rope, quality_shift, with_quality, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the fastq file : %v", chunks.Source, err)
|
||||
@@ -339,7 +420,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastqEntry,
|
||||
"\n@",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
@@ -296,7 +296,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
|
||||
case strings.HasSuffix(skey, "_taxid"):
|
||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||
rank, _ := obiutils.SplitInTwo(skey, '_')
|
||||
rank := skey[:len(skey)-len("_taxid")]
|
||||
|
||||
taxid := string(value)
|
||||
sequence.SetTaxid(taxid, rank)
|
||||
|
||||
@@ -29,70 +29,11 @@ const (
|
||||
|
||||
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||
|
||||
// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation.
|
||||
// The carry buffer (stack) handles lines that span two rope nodes.
|
||||
type gbRopeScanner struct {
|
||||
current *PieceOfChunk
|
||||
pos int
|
||||
carry [256]byte // max GenBank line = 80 chars; 256 gives ample margin
|
||||
carryN int
|
||||
}
|
||||
|
||||
func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner {
|
||||
return &gbRopeScanner{current: rope}
|
||||
}
|
||||
|
||||
// ReadLine returns the next line without the trailing \n (or \r\n).
|
||||
// Returns nil at end of rope. The returned slice aliases carry[] or the node
|
||||
// data and is valid only until the next ReadLine call.
|
||||
func (s *gbRopeScanner) ReadLine() []byte {
|
||||
for {
|
||||
if s.current == nil {
|
||||
if s.carryN > 0 {
|
||||
n := s.carryN
|
||||
s.carryN = 0
|
||||
return s.carry[:n]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
|
||||
if idx >= 0 {
|
||||
var line []byte
|
||||
if s.carryN == 0 {
|
||||
line = data[:idx]
|
||||
} else {
|
||||
n := copy(s.carry[s.carryN:], data[:idx])
|
||||
s.carryN += n
|
||||
line = s.carry[:s.carryN]
|
||||
s.carryN = 0
|
||||
}
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
if len(line) > 0 && line[len(line)-1] == '\r' {
|
||||
line = line[:len(line)-1]
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
// No \n in this node: accumulate into carry and advance
|
||||
n := copy(s.carry[s.carryN:], data)
|
||||
s.carryN += n
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
|
||||
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
|
||||
// appending compacted bases to dest. Returns the extended slice.
|
||||
// Stops and returns when "//" is found at the start of a line.
|
||||
// The scanner is left positioned after the "//" line.
|
||||
func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
lineStart := true
|
||||
skipDigits := true
|
||||
|
||||
@@ -139,24 +80,6 @@ func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
return dest
|
||||
}
|
||||
|
||||
// skipToNewline advances the scanner past the next '\n'.
|
||||
func (s *gbRopeScanner) skipToNewline() {
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
if idx >= 0 {
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
|
||||
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
|
||||
// Format: "LOCUS <id> <length> bp ..."
|
||||
// Returns -1 if not found or parse error.
|
||||
@@ -205,7 +128,7 @@ func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
state := inHeader
|
||||
scanner := newGbRopeScanner(rope)
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
id := ""
|
||||
|
||||
77
pkg/obiformats/rope_scanner.go
Normal file
77
pkg/obiformats/rope_scanner.go
Normal file
@@ -0,0 +1,77 @@
|
||||
package obiformats
|
||||
|
||||
import "bytes"
|
||||
|
||||
// ropeScanner reads lines from a PieceOfChunk rope.
|
||||
// The carry buffer handles lines that span two rope nodes; it grows as needed.
|
||||
type ropeScanner struct {
|
||||
current *PieceOfChunk
|
||||
pos int
|
||||
carry []byte
|
||||
}
|
||||
|
||||
func newRopeScanner(rope *PieceOfChunk) *ropeScanner {
|
||||
return &ropeScanner{current: rope}
|
||||
}
|
||||
|
||||
// ReadLine returns the next line without the trailing \n (or \r\n).
|
||||
// Returns nil at end of rope. The returned slice aliases carry[] or the node
|
||||
// data and is valid only until the next ReadLine call.
|
||||
func (s *ropeScanner) ReadLine() []byte {
|
||||
for {
|
||||
if s.current == nil {
|
||||
if len(s.carry) > 0 {
|
||||
line := s.carry
|
||||
s.carry = s.carry[:0]
|
||||
return line
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
|
||||
if idx >= 0 {
|
||||
var line []byte
|
||||
if len(s.carry) == 0 {
|
||||
line = data[:idx]
|
||||
} else {
|
||||
s.carry = append(s.carry, data[:idx]...)
|
||||
line = s.carry
|
||||
s.carry = s.carry[:0]
|
||||
}
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
if len(line) > 0 && line[len(line)-1] == '\r' {
|
||||
line = line[:len(line)-1]
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
// No \n in this node: accumulate into carry and advance
|
||||
s.carry = append(s.carry, data...)
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
|
||||
// skipToNewline advances the scanner past the next '\n'.
|
||||
func (s *ropeScanner) skipToNewline() {
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
if idx >= 0 {
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,7 @@ package obioptions
|
||||
// Version is automatically updated by the Makefile from version.txt
|
||||
// The patch number (third digit) is incremented on each push to the repository
|
||||
|
||||
var _Version = "Release 4.4.19"
|
||||
var _Version = "Release 4.4.20"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
//
|
||||
|
||||
@@ -480,6 +480,15 @@ func (s *BioSequence) SetQualities(qualities Quality) {
|
||||
s.qualities = CopySlice(qualities)
|
||||
}
|
||||
|
||||
// TakeQualities stores the slice directly without copying.
|
||||
// The caller must not use the slice after this call.
|
||||
func (s *BioSequence) TakeQualities(qualities Quality) {
|
||||
if s.qualities != nil {
|
||||
RecycleSlice(&s.qualities)
|
||||
}
|
||||
s.qualities = qualities
|
||||
}
|
||||
|
||||
// A method that appends a byte slice to the qualities of the BioSequence.
|
||||
func (s *BioSequence) WriteQualities(data []byte) (int, error) {
|
||||
s.qualities = append(s.qualities, data...)
|
||||
|
||||
@@ -195,7 +195,7 @@ func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy, seqAsTaxa
|
||||
return nil, fmt.Errorf("sequence %v has no path", s.Id())
|
||||
}
|
||||
last := path[len(path)-1]
|
||||
taxname, _ := obiutils.SplitInTwo(last, ':')
|
||||
taxname, _ := obiutils.LeftSplitInTwo(last, ':')
|
||||
if idx, ok := s.GetIntAttribute("seq_number"); !ok {
|
||||
return nil, errors.New("sequences are not numbered")
|
||||
} else {
|
||||
|
||||
@@ -31,7 +31,7 @@ func NewTaxidFactory(code string, alphabet obiutils.AsciiSet) *TaxidFactory {
|
||||
// It extracts the relevant part of the string after the first colon (':') if present.
|
||||
func (f *TaxidFactory) FromString(taxid string) (Taxid, error) {
|
||||
taxid = obiutils.AsciiSpaceSet.TrimLeft(taxid)
|
||||
part1, part2 := obiutils.SplitInTwo(taxid, ':')
|
||||
part1, part2 := obiutils.LeftSplitInTwo(taxid, ':')
|
||||
if len(part2) == 0 {
|
||||
taxid = part1
|
||||
} else {
|
||||
|
||||
@@ -64,7 +64,7 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
fmt.Println(err)
|
||||
}
|
||||
|
||||
destfile, err := obiutils.CompressStream(file, true, true)
|
||||
destfile, err := obiutils.CompressStream(file, compressed, true)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
|
||||
@@ -144,7 +144,7 @@ func (r *AsciiSet) TrimLeft(s string) string {
|
||||
return s[i:]
|
||||
}
|
||||
|
||||
func SplitInTwo(s string, sep byte) (string, string) {
|
||||
func LeftSplitInTwo(s string, sep byte) (string, string) {
|
||||
i := 0
|
||||
for ; i < len(s); i++ {
|
||||
c := s[i]
|
||||
@@ -157,3 +157,17 @@ func SplitInTwo(s string, sep byte) (string, string) {
|
||||
}
|
||||
return s[:i], s[i+1:]
|
||||
}
|
||||
|
||||
// RightSplitInTwo splits s around the last occurrence of sep and returns the
// text before and after it; the separator itself is dropped.
//
// When sep does not occur in s (including when s is empty), s is returned
// unchanged together with an empty second part.
func RightSplitInTwo(s string, sep byte) (string, string) {
	// Scan from the right. Running past the first byte means "not found":
	// this must be detected before slicing, a negative index would panic.
	for i := len(s) - 1; i >= 0; i-- {
		if s[i] == sep {
			return s[:i], s[i+1:]
		}
	}
	return s, ""
}
|
||||
|
||||
36
tools/json2md.py
Executable file
36
tools/json2md.py
Executable file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
"""
Read potentially malformed JSON from stdin (aichat output), extract title and
body, and print them as plain text: title on first line, blank line, then body.
Exits with 1 on failure (no output).
"""

import sys
import json
import re


def extract_notes(text):
    """Return (title, body) extracted from text, or None on failure.

    The JSON object is taken as the span going from the first '{' to the last
    '}' of text, so any chatter around it is ignored. It is decoded with
    strict=False, which accepts raw control characters (typically unescaped
    newlines) inside string values, whether or not the object itself is
    pretty-printed over several lines.

    None is returned when no object is found, when it cannot be decoded, when
    it is not a JSON object, or when 'title' or 'body' is missing, is not a
    string, or is empty once stripped.
    """
    m = re.search(r'\{.*\}', text, re.DOTALL)
    if not m:
        return None

    try:
        obj = json.loads(m.group(), strict=False)
    except ValueError:
        return None

    if not isinstance(obj, dict):
        return None

    title = obj.get('title', '')
    body = obj.get('body', '')
    if not isinstance(title, str) or not isinstance(body, str):
        return None

    title = title.strip()
    body = body.strip()
    if not title or not body:
        return None

    return title, body


def main():
    """Convert the JSON read from stdin; exit with status 1 on failure."""
    notes = extract_notes(sys.stdin.read())
    if notes is None:
        sys.exit(1)

    title, body = notes
    print(f"{title}\n\n{body}")


if __name__ == '__main__':
    main()
|
||||
@@ -1 +1 @@
|
||||
4.4.19
|
||||
4.4.20
|
||||
|
||||
Reference in New Issue
Block a user