From f14860a4860a762c575b6af72e177dcd7324c089 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 27 May 2022 11:53:29 +0300 Subject: [PATCH] Patch header parsing and formatting --- Makefile | 8 +++++- cmd/test/main.go | 36 ++++++++++++++------------- pkg/obichunk/chunk_on_disk.go | 8 +----- pkg/obiformats/fastseq_json_header.go | 14 ++++++++++- pkg/obiformats/fastseq_obi_header.go | 27 ++++++++++++++++++-- pkg/obiiter/batchiterator.go | 12 +++++++++ pkg/obiseq/biosequence.go | 1 + pkg/obiseq/merge.go | 14 ++++++++++- 8 files changed, 91 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 4b9b873..2f38198 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ OBITOOLS_SRC:= $(wildcard cmd/obitools/*/*.go) OBITOOLS_DIRS:=$(sort $(patsubst %/,%,$(dir $(OBITOOLS_SRC)))) OBITOOLS:=$(notdir $(OBITOOLS_DIRS)) -.PHONY: all +.PHONY: all obitools define MAKE_PKG_RULE pkg-$(notdir $(1)): $(1) @@ -43,6 +43,12 @@ all: obitools packages: $(patsubst %,pkg-%,$(PACKAGES)) obitools: $(patsubst %,$(OBITOOLS_PREFIX)%,$(OBITOOLS)) + +macos-pkg: + @bash pkgs/macos/macos-installer-builder-master/macOS-x64/build-macos-x64.sh \ + OBITools \ + 0.0.1 + $(BUILD_DIR): mkdir -p $@ diff --git a/cmd/test/main.go b/cmd/test/main.go index 5e30b1a..97a9e9c 100644 --- a/cmd/test/main.go +++ b/cmd/test/main.go @@ -1,14 +1,13 @@ package main import ( - "fmt" "log" "os" "runtime/trace" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" - - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiclean" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" ) func main() { @@ -20,13 +19,16 @@ func main() { trace.Start(ftrace) defer trace.Stop() - // option_parser := obioptions.GenerateOptionParser( - // obiconvert.InputOptionSet, - // ) + option_parser := obioptions.GenerateOptionParser( + obiconvert.InputOptionSet, + ) - //_, args, 
_ := option_parser(os.Args) + _, args, _ := option_parser(os.Args) + + fs, _ := obiconvert.ReadBioSequencesBatch(args...) + + obiclean.IOBIClean(fs) - // fs, _ := obiconvert.ReadBioSequences(args...) // buffer := make([]byte, 0) // fs.Next() // s := fs.Get() @@ -41,19 +43,19 @@ func main() { // fmt.Printf("Shift : %d Score : %d\n", maxshift, maxcount) // } - A := []byte("ccgcctccttagaacaggctcctctagaaaaccatgtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg") - B := []byte("ccgcctccttagaacaggctcctctagaaaaaccatgtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg") + // A := []byte("ccgcctccttagaacaggctcctctagaaaaccatgtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg") + // B := []byte("ccgcctccttagaacaggctcctctagaaaaaccatgtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg") // B := []byte("cgccaccaccgagatctacactctttccctacacgacgctcttccgatctccgcctccttagaacaggctcctctagaaaagcatagtggggtatctaaaggaggcgg") - sA := obiseq.NewBioSequence("A", A, "") - sB := obiseq.MakeBioSequence("B", B, "") + // sA := obiseq.NewBioSequence("A", A, "") + // sB := obiseq.MakeBioSequence("B", B, "") - s, l := obialign.LCSScore(sA, &sB, 2, nil) + // s, l := obialign.LCSScore(sA, &sB, 2, nil) - fmt.Printf("score : %d length : %d error : %d\n", s, l, l-s) + // fmt.Printf("score : %d length : %d error : %d\n", s, l, l-s) - s, l = obialign.LCSScore(&sB, &sB, 2, nil) + // s, l = obialign.LCSScore(&sB, &sB, 2, nil) - fmt.Printf("score : %d length : %d error : %d\n", s, l, l-s) + // fmt.Printf("score : %d length : %d error : %d\n", s, l, l-s) // pat, _ := obiapat.MakeApatPattern("TCCTTCCAACAGGCTCCTC", 3) // as, _ := obiapat.MakeApatSequence(sA, false) diff --git a/pkg/obichunk/chunk_on_disk.go b/pkg/obichunk/chunk_on_disk.go index 1c15cc5..07d8868 100644 --- a/pkg/obichunk/chunk_on_disk.go +++ b/pkg/obichunk/chunk_on_disk.go @@ -81,13 +81,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequenceBatch, panic(err) } - 
//chunck := make(obiseq.BioSequenceSlice, 0, 10000) - chunck := obiseq.MakeBioSequenceSlice() - for iseq.Next() { - b := iseq.Get() - chunck = append(chunck, b.Slice()...) - b.Recycle() - } + chunck := iseq.Load() newIter.Push(obiiter.MakeBioSequenceBatch(order, chunck)) log.Infof("Start processing of batch %d/%d : %d sequences", diff --git a/pkg/obiformats/fastseq_json_header.go b/pkg/obiformats/fastseq_json_header.go index 0515fde..a8cefbf 100644 --- a/pkg/obiformats/fastseq_json_header.go +++ b/pkg/obiformats/fastseq_json_header.go @@ -1,9 +1,11 @@ package obiformats import ( - log "github.com/sirupsen/logrus" + "math" "strings" + log "github.com/sirupsen/logrus" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "github.com/goccy/go-json" ) @@ -42,6 +44,16 @@ func _parse_json_header_(header string, annotations obiseq.Annotation) string { stop++ err := json.Unmarshal([]byte(header)[start:stop], &annotations) + + for k, v := range annotations { + switch vt := v.(type) { + case float64 : + if vt == math.Floor(vt) { + annotations[k] = int(vt) + } + } + } + if err != nil { log.Fatalf("annotation parsing error on %s : %v\n", header, err) } diff --git a/pkg/obiformats/fastseq_obi_header.go b/pkg/obiformats/fastseq_obi_header.go index 5c709aa..dd29d31 100644 --- a/pkg/obiformats/fastseq_obi_header.go +++ b/pkg/obiformats/fastseq_obi_header.go @@ -3,6 +3,8 @@ package obiformats import ( "bytes" "fmt" + "log" + "math" "regexp" "strconv" "strings" @@ -251,9 +253,20 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string { } // End of not string } // End of not numeric - annotations[key] = value + switch vt := value.(type) { + case float64: + if vt == math.Floor(vt) { + annotations[key] = int(vt) + } + default: + annotations[key] = value + } - d = part[stop:] + if stop < len(part) { + d = part[stop:] + } else { + d = []byte{} + } //m = __obi_header_key_pattern__.FindIndex(d) m = __match__key__(d) } @@ -280,6 +293,16 @@ func 
FormatFastSeqOBIHeader(sequence *obiseq.BioSequence) string { switch t := value.(type) { case string: text.WriteString(fmt.Sprintf("%s=%s; ", key, t)) + case map[string]int, + map[string]interface{}: + tv, err := json.Marshal(t) + if err != nil { + log.Fatalf("Cannot convert %v value", value) + } + tv = bytes.ReplaceAll(tv, []byte(`"`), []byte("'")) + text.WriteString(fmt.Sprintf("%s=", key)) + text.Write(tv) + text.WriteString("; ") default: text.WriteString(fmt.Sprintf("%s=%v; ", key, value)) } diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index da67db4..910aa36 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -612,3 +612,15 @@ func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate, return trueIter.Rebatch(size) } + +func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice { + + chunck := obiseq.MakeBioSequenceSlice() + for iterator.Next() { + b := iterator.Get() + chunck = append(chunck, b.Slice()...) 
+ b.Recycle() + } + + return chunck +} \ No newline at end of file diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 318bfc2..56d27ec 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -257,3 +257,4 @@ func (s *BioSequence) WriteByte(data byte) error { s.sequence = append(s.sequence, data) return nil } + diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index ffe548c..29bb581 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -2,8 +2,10 @@ package obiseq import ( "fmt" - log "github.com/sirupsen/logrus" "strings" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + log "github.com/sirupsen/logrus" ) type StatsOnValues map[string]int @@ -33,6 +35,16 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { case StatsOnValues: stats = istat newstat = false + case map[string]interface{}: + stats = make(StatsOnValues, len(istat)) + var err error + for k, v := range istat { + stats[k], err = goutils.InterfaceToInt(v) + if err != nil { + log.Panicf("In sequence %s : %s stat tag not only containing integer values %s", + sequence.Id(), mkey, istat) + } + } default: stats = make(StatsOnValues, 100) annotations[mkey] = stats