From 7bd073ccd4d6889b94f5fce6fa799f6fc56562d1 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 3 Mar 2024 11:16:24 -0400 Subject: [PATCH] First version of obisplit and patch a bug in the new workers API Former-commit-id: f28af9f104c08d68e29fd866739d8dd58241da63 --- cmd/obitools/obisplit/main.go | 57 ++++++ pkg/obiapat/pattern.go | 36 +++- pkg/obiiter/workers.go | 4 +- pkg/obiseq/worker.go | 53 ++--- pkg/obitools/obiannotate/obiannotate.go | 2 +- pkg/obitools/obisplit/obisplit.go | 257 ++++++++++++++++++++++++ pkg/obitools/obisplit/options.go | 135 +++++++++++++ 7 files changed, 507 insertions(+), 37 deletions(-) create mode 100644 cmd/obitools/obisplit/main.go create mode 100644 pkg/obitools/obisplit/obisplit.go create mode 100644 pkg/obitools/obisplit/options.go diff --git a/cmd/obitools/obisplit/main.go b/cmd/obitools/obisplit/main.go new file mode 100644 index 0000000..95a402b --- /dev/null +++ b/cmd/obitools/obisplit/main.go @@ -0,0 +1,57 @@ +package main + +import ( + "fmt" + "os" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisplit" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser(obisplit.OptionSet) + + _, args := optionParser(os.Args) + + if obisplit.CLIAskConfigTemplate() { + fmt.Print(obisplit.CLIConfigTemplate()) + os.Exit(0) + } + + sequences, err := obiconvert.CLIReadBioSequences(args...) + + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + + annotator := obisplit.CLISlitPipeline() + obiconvert.CLIWriteBioSequences(sequences.Pipe(annotator), true) + + obiiter.WaitForLastPipe() + +} diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index bf955bb..6b6526b 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -214,7 +214,7 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap var errmsg *C.char if apat_p != nil && apat_p.pointer != nil { - log.Debugf("Finaliser called on %p\n", apat_p.pointer) + // log.Debugf("Finaliser called on %p\n", apat_p.pointer) C.delete_apatseq(apat_p.pointer, &errno, &errmsg) } }) @@ -376,3 +376,37 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( // func AllocatedApaSequences() int { // return int(_AllocatedApaSequences) // } + +func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) (loc [][3]int) { + res := pattern.FindAllIndex(sequence, begin, length) + + sbuffer := [(int(C.MAX_PAT_LEN) + int(C.MAX_PAT_ERR) + 1) * (int(C.MAX_PAT_LEN) + 1)]uint64{} + buffer := sbuffer[:] + + for _, m := range res { + if m[2] > 0 && pattern.pointer.pointer.hasIndel { + start := m[0] - m[2] + end := m[0] + int(pattern.pointer.pointer.patlen) + m[2] + start = obiutils.MaxInt(start, 0) + end = obiutils.MinInt(end, sequence.Len()) + + cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) + frg := sequence.pointer.reference.Sequence()[start:end] + + score, lali := obialign.FastLCSEGFScoreByte( + frg, + (*cpattern)[0:int(pattern.pointer.pointer.patlen)], + m[2], true, &buffer) + + // log.Debugf("seq[%d] : %s %d, %d", i, sequence.pointer.reference.Id(), score, lali) + + m[2] = lali - score + m[0] = m[0] + int(pattern.pointer.pointer.patlen) - lali + m[1] = m[0] + lali + } + } + + log.Debugf("All matches : %v", res) + + return res +} diff --git a/pkg/obiiter/workers.go b/pkg/obiiter/workers.go index 00ed1db..257cdae 100644 --- a/pkg/obiiter/workers.go +++ b/pkg/obiiter/workers.go @@ -34,7 +34,7 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, }() - sw := obiseq.SeqToSliceWorker(worker, true, breakOnError) + sw := obiseq.SeqToSliceWorker(worker, breakOnError) f := func(iterator IBioSequence) { var err error @@ -90,7 +90,7 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre }() - sw := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, breakOnError) + sw := obiseq.SeqToSliceConditionalWorker(predicate, worker, breakOnError) f := func(iterator IBioSequence) { var err error diff --git a/pkg/obiseq/worker.go b/pkg/obiseq/worker.go index df517d7..442650d 100644 --- a/pkg/obiseq/worker.go +++ b/pkg/obiseq/worker.go @@ -25,37 +25,27 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { } func SeqToSliceWorker(worker SeqWorker, - inplace, breakOnError bool) SeqSliceWorker { + breakOnError bool) SeqSliceWorker { var f SeqSliceWorker if worker == nil { - if inplace { - f = func(input BioSequenceSlice) (BioSequenceSlice, error) { - return input, nil - } - } else { - f = func(input BioSequenceSlice) (BioSequenceSlice, error) { - output := MakeBioSequenceSlice(len(input)) - copy(output, input) - return output, nil - } + f = func(input BioSequenceSlice) (BioSequenceSlice, error) { + return input, nil } } else { f = func(input BioSequenceSlice) (BioSequenceSlice, error) { - output := input - if !inplace { - output = MakeBioSequenceSlice(len(input)) - } + output := MakeBioSequenceSlice(len(input)) i := 0 for _, s := range input { r, err := worker(s) if err == nil { for _, rs := range r { + if i == len(output) { + output = slices.Grow(output, cap(output)) + output = output[:cap(output)] + } output[i] = rs i++ - if i == cap(output) { - slices.Grow(output, cap(output)) - } } } else { @@ -64,8 +54,8 @@ func SeqToSliceWorker(worker SeqWorker, s.Id(), err) return BioSequenceSlice{}, err } else { - log.Warnf("got an error on sequence %s processing", - s.Id()) + log.Warnf("got an error on sequence %s processing : %v", + s.Id(), err) } } } @@ -80,18 +70,14 @@ func SeqToSliceWorker(worker SeqWorker, func SeqToSliceConditionalWorker( condition SequencePredicate, - worker SeqWorker, - inplace, breakOnError bool) SeqSliceWorker { + worker SeqWorker, breakOnError bool) SeqSliceWorker { if condition == nil { - return SeqToSliceWorker(worker, inplace, breakOnError) + return SeqToSliceWorker(worker, breakOnError) } f := func(input BioSequenceSlice) (BioSequenceSlice, error) { - output := input - if !inplace { - output = MakeBioSequenceSlice(len(input)) - } + output := MakeBioSequenceSlice(len(input)) i := 0 @@ -100,11 +86,12 @@ func SeqToSliceConditionalWorker( r, err := worker(s) if err == nil { for _, rs := range r { + if i == len(output) { + output = slices.Grow(output, cap(output)) + output = output[:cap(output)] + } output[i] = rs i++ - if i == cap(output) { - slices.Grow(output, cap(output)) - } } } else { if breakOnError { @@ -112,8 +99,8 @@ func SeqToSliceConditionalWorker( s.Id(), err) return BioSequenceSlice{}, err } else { - log.Warnf("got an error on sequence %s processing", - s.Id()) + log.Warnf("got an error on sequence %s processing : %v", + s.Id(), err) } } } @@ -134,7 +121,7 @@ func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker { } } - sw := SeqToSliceWorker(next, true, false) + sw := SeqToSliceWorker(next, false) f := func(seq *BioSequence) (BioSequenceSlice, error) { if seq == nil { diff --git a/pkg/obitools/obiannotate/obiannotate.go b/pkg/obitools/obiannotate/obiannotate.go index 274896f..a3b48c8 100644 --- a/pkg/obitools/obiannotate/obiannotate.go +++ b/pkg/obitools/obiannotate/obiannotate.go @@ -313,7 +313,7 @@ func CLIAnnotationPipeline() obiiter.Pipeable { predicate := obigrep.CLISequenceSelectionPredicate() worker := CLIAnnotationWorker() - annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, false) + annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, false) f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers()) return f diff --git a/pkg/obitools/obisplit/obisplit.go b/pkg/obitools/obisplit/obisplit.go new file mode 100644 index 0000000..136b445 --- /dev/null +++ b/pkg/obitools/obisplit/obisplit.go @@ -0,0 +1,257 @@ +package obisplit + +import ( + "fmt" + "sort" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +type SplitSequence struct { + pattern string + name string + forward_pattern obiapat.ApatPattern + reverse_pattern obiapat.ApatPattern +} + +type Pattern_match struct { + name string + pattern string + match string + begin int + end int + nerrors int + forward bool +} + +func LocatePatterns(sequence *obiseq.BioSequence, + patterns []SplitSequence) []Pattern_match { + + aseq, err := obiapat.MakeApatSequence(sequence, false) + + if err != nil { + log.Fatalf("Cannot index sequence %s for patern matching", sequence.Id()) + } + + res := make([]Pattern_match, 0, 10) + + for _, split := range patterns { + ms := split.forward_pattern.AllMatches(aseq, 0, aseq.Len()) + for _, m := range ms { + m[0] = max(0, m[0]) + m[1] = min(sequence.Len(), m[1]) + match, err := sequence.Subsequence(m[0], m[1], false) + + if err != nil { + log.Fatalf("Cannot extract pattern %s from sequence %s", split.pattern, sequence.Id()) + } + + res = append(res, Pattern_match{ + name: split.name, + pattern: split.pattern, + match: match.String(), + begin: m[0], + end: m[1], + nerrors: m[2], + forward: true, + }) + } + + ms = split.reverse_pattern.AllMatches(aseq, 0, aseq.Len()) + for _, m := range ms { + m[0] = max(0, m[0]) + m[1] = min(sequence.Len(), m[1]) + match, err := sequence.Subsequence(m[0], m[1], false) + + if err != nil { + log.Fatalf("Cannot extract reverse pattern %s from sequence %s", split.pattern, sequence.Id()) + } + + match = match.ReverseComplement(true) + + res = append(res, Pattern_match{ + name: split.name, + pattern: split.pattern, + match: match.String(), + begin: m[0], + end: m[1], + nerrors: m[2], + forward: false, + }) + } + + } + + sort.Slice(res, func(i, j int) bool { + a := res[i].begin + b := res[j].begin + return a < b + }) + + log.Debugf("Sequence %s Raw match : %v", sequence.Id(), res) + if len(res) > 1 { + j := 0 + m1 := res[0] + for _, m2 := range res[1:] { + if m2.begin < m1.end { + if m2.nerrors < m1.nerrors { + m1 = m2 + } + continue + } + res[j] = m1 + m1 = m2 + j++ + } + + res[j] = m1 + res = res[:j+1] + } + + log.Debugf("Sequence %s No overlap match : %v", sequence.Id(), res) + + return res +} + +func SplitPattern(sequence *obiseq.BioSequence, + patterns []SplitSequence) (obiseq.BioSequenceSlice, error) { + + matches := LocatePatterns(sequence, patterns) + + from := Pattern_match{ + name: "5extremity", + pattern: "", + match: "", + begin: 0, + end: 0, + nerrors: 0, + forward: true, + } + + res := obiseq.MakeBioSequenceSlice(10) + nfrag := 0 + res = res[:nfrag] + + for i, to := range matches { + log.Debugf("from : %v to : %v", from, to) + start := from.end + end := to.begin + + if i == 0 && end <= 0 { + from = to + continue + } + + if end > start { + log.Debugf("Extracting fragment %d from sequence %s [%d:%d]", + nfrag+1, sequence.Id(), + start, end, + ) + + sub, err := sequence.Subsequence(start, end, false) + + if err != nil { + return res[:nfrag], + fmt.Errorf("cannot extract fragment %d from sequence %s [%d:%d]", + nfrag+1, sequence.Id(), + start, end, + ) + } + + nfrag++ + sub.SetAttribute("obisplit_frg", nfrag) + + if from.name == to.name { + sub.SetAttribute("obisplit_group", from.name) + } else { + sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name)) + } + + sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end)) + + sub.SetAttribute("obisplit_right_error", to.nerrors) + sub.SetAttribute("obisplit_left_error", from.nerrors) + + sub.SetAttribute("obisplit_right_pattern", to.pattern) + sub.SetAttribute("obisplit_left_pattern", from.pattern) + + sub.SetAttribute("obisplit_left_match", from.match) + sub.SetAttribute("obisplit_right_match", to.match) + + res = append(res, sub) + + } + from = to + } + + if from.end < sequence.Len() { + to := Pattern_match{ + name: "3extremity", + pattern: "", + match: "", + begin: sequence.Len(), + end: sequence.Len(), + nerrors: 0, + forward: true, + } + + start := from.end + end := to.begin + + sub, err := sequence.Subsequence(start, end, false) + + if err != nil { + return res[:nfrag], + fmt.Errorf("cannot extract last fragment %d from sequence %s [%d:%d]", + nfrag+1, sequence.Id(), + start, end, + ) + } + + nfrag++ + sub.SetAttribute("obisplit_frg", nfrag) + if from.name == to.name { + sub.SetAttribute("obisplit_group", from.name) + } else { + sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name)) + } + sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end)) + + sub.SetAttribute("obisplit_right_error", to.nerrors) + sub.SetAttribute("obisplit_left_error", from.nerrors) + + sub.SetAttribute("obisplit_right_pattern", to.pattern) + sub.SetAttribute("obisplit_left_pattern", from.pattern) + + sub.SetAttribute("obisplit_left_match", from.match) + sub.SetAttribute("obisplit_right_match", to.match) + + res = append(res, sub) + + } + + return res[:nfrag], nil +} + +func SplitPatternWorker(patterns []SplitSequence) obiseq.SeqWorker { + f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + return SplitPattern(sequence, patterns) + } + + return f +} + +func CLISlitPipeline() obiiter.Pipeable { + + worker := SplitPatternWorker(CLIConfig()) + + annotator := obiseq.SeqToSliceWorker(worker, false) + f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers()) + + return f +} diff --git a/pkg/obitools/obisplit/options.go b/pkg/obitools/obisplit/options.go new file mode 100644 index 0000000..5e94961 --- /dev/null +++ b/pkg/obitools/obisplit/options.go @@ -0,0 +1,135 @@ +package obisplit + +import ( + "encoding/csv" + "fmt" + "log" + "os" + "slices" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _askTemplate = false +var _config = "" +var _pattern_error = 4 +var _pattern_indel = false + +func SplitOptionSet(options *getoptions.GetOpt) { + + options.StringVar(&_config, "config", _config, + options.Description("The configuration file."), + options.Alias("C")) + + options.BoolVar(&_askTemplate, "template", _askTemplate, + options.Description("Print on the standard output a script template."), + ) + + options.IntVar(&_pattern_error, "pattern-error", _pattern_error, + options.Description("Maximum number of allowed error during pattern matching"), + ) + + options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel, + options.Description("Allows for indel during pattern matching"), + ) + +} + +func OptionSet(options *getoptions.GetOpt) { + SplitOptionSet(options) + obiconvert.OptionSet(options) +} + +func CLIHasConfig() bool { + return CLIConfigFile() != "" +} + +func CLIConfigFile() string { + return _config +} + +func CLIConfig() []SplitSequence { + // os.Open() opens specific file in + // read-only mode and this return + // a pointer of type os.File + file, err := os.Open(CLIConfigFile()) + + // Checks for the error + if err != nil { + log.Fatal("Error while reading the file", err) + } + + // Closes the file + defer file.Close() + + reader := csv.NewReader(file) + records, err := reader.ReadAll() + + // Checks for the error + if err != nil { + fmt.Println("Error reading records") + } + + config := make([]SplitSequence, 0, max(0, len(records)-1)) + + header := records[0] + + pattern_idx := slices.Index(header, "T-tag") + pool_idx := slices.Index(header, "pcr_pool") + + if pattern_idx == -1 { + log.Fatalf("Config file %s doesn't contain `T-tag`column", CLIConfigFile()) + } + + if pool_idx == -1 { + pool_idx = pattern_idx + } + + // Loop to iterate through + // and print each of the string slice + for _, eachrecord := range records[1:] { + + fp, err := obiapat.MakeApatPattern(eachrecord[pattern_idx], + CLIPatternError(), CLIPatternInDels()) + if err != nil { + log.Fatalf("Error cannot compile pattern %s : %v", + eachrecord[pattern_idx], err) + } + + rv, err := fp.ReverseComplement() + if err != nil { + log.Fatalf("Error cannot reverse complement pattern %s: %v", + eachrecord[pattern_idx], err) + } + + config = append(config, SplitSequence{ + pattern: eachrecord[pattern_idx], + name: eachrecord[pool_idx], + forward_pattern: fp, + reverse_pattern: rv, + }) + } + + return config +} + +func CLIPatternError() int { + return _pattern_error +} + +func CLIPatternInDels() bool { + return _pattern_indel +} + +func CLIAskConfigTemplate() bool { + return _askTemplate +} + +func CLIConfigTemplate() string { + return `T-tag,pcr_pool +CGGCACCTGTTACGCAGCCACTATCGGCT,pool_1 +CGGCAAGACCCTATTGCATTGGCGCGGCT,pool_2 +` +}