First version of obisplit and patch a bug in the new workers API

Former-commit-id: f28af9f104c08d68e29fd866739d8dd58241da63
This commit is contained in:
2024-03-03 11:16:24 -04:00
parent 0f3871d203
commit 7bd073ccd4
7 changed files with 507 additions and 37 deletions

View File

@ -0,0 +1,57 @@
package main
import (
"fmt"
"os"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obisplit"
)
func main() {
defer obiseq.LogBioSeqStatus()
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
// f, err := os.Create("cpu.pprof")
// if err != nil {
// log.Fatal(err)
// }
// pprof.StartCPUProfile(f)
// defer pprof.StopCPUProfile()
// go tool trace cpu.trace
// ftrace, err := os.Create("cpu.trace")
// if err != nil {
// log.Fatal(err)
// }
// trace.Start(ftrace)
// defer trace.Stop()
optionParser := obioptions.GenerateOptionParser(obisplit.OptionSet)
_, args := optionParser(os.Args)
if obisplit.CLIAskConfigTemplate() {
fmt.Print(obisplit.CLIConfigTemplate())
os.Exit(0)
}
sequences, err := obiconvert.CLIReadBioSequences(args...)
if err != nil {
log.Errorf("Cannot open file (%v)", err)
os.Exit(1)
}
annotator := obisplit.CLISlitPipeline()
obiconvert.CLIWriteBioSequences(sequences.Pipe(annotator), true)
obiiter.WaitForLastPipe()
}

View File

@ -214,7 +214,7 @@ func MakeApatSequence(sequence *obiseq.BioSequence, circular bool, recycle ...Ap
var errmsg *C.char
if apat_p != nil && apat_p.pointer != nil {
log.Debugf("Finaliser called on %p\n", apat_p.pointer)
// log.Debugf("Finaliser called on %p\n", apat_p.pointer)
C.delete_apatseq(apat_p.pointer, &errno, &errmsg)
}
})
@ -376,3 +376,37 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
// func AllocatedApaSequences() int {
// return int(_AllocatedApaSequences)
// }
func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) (loc [][3]int) {
res := pattern.FindAllIndex(sequence, begin, length)
sbuffer := [(int(C.MAX_PAT_LEN) + int(C.MAX_PAT_ERR) + 1) * (int(C.MAX_PAT_LEN) + 1)]uint64{}
buffer := sbuffer[:]
for _, m := range res {
if m[2] > 0 && pattern.pointer.pointer.hasIndel {
start := m[0] - m[2]
end := m[0] + int(pattern.pointer.pointer.patlen) + m[2]
start = obiutils.MaxInt(start, 0)
end = obiutils.MinInt(end, sequence.Len())
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end]
score, lali := obialign.FastLCSEGFScoreByte(
frg,
(*cpattern)[0:int(pattern.pointer.pointer.patlen)],
m[2], true, &buffer)
// log.Debugf("seq[%d] : %s %d, %d", i, sequence.pointer.reference.Id(), score, lali)
m[2] = lali - score
m[0] = m[0] + int(pattern.pointer.pointer.patlen) - lali
m[1] = m[0] + lali
}
}
log.Debugf("All matches : %v", res)
return res
}

View File

@ -34,7 +34,7 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker,
}()
sw := obiseq.SeqToSliceWorker(worker, true, breakOnError)
sw := obiseq.SeqToSliceWorker(worker, breakOnError)
f := func(iterator IBioSequence) {
var err error
@ -90,7 +90,7 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre
}()
sw := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, breakOnError)
sw := obiseq.SeqToSliceConditionalWorker(predicate, worker, breakOnError)
f := func(iterator IBioSequence) {
var err error

View File

@ -25,37 +25,27 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
}
func SeqToSliceWorker(worker SeqWorker,
inplace, breakOnError bool) SeqSliceWorker {
breakOnError bool) SeqSliceWorker {
var f SeqSliceWorker
if worker == nil {
if inplace {
f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
return input, nil
}
} else {
f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
output := MakeBioSequenceSlice(len(input))
copy(output, input)
return output, nil
}
f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
return input, nil
}
} else {
f = func(input BioSequenceSlice) (BioSequenceSlice, error) {
output := input
if !inplace {
output = MakeBioSequenceSlice(len(input))
}
output := MakeBioSequenceSlice(len(input))
i := 0
for _, s := range input {
r, err := worker(s)
if err == nil {
for _, rs := range r {
if i == len(output) {
output = slices.Grow(output, cap(output))
output = output[:cap(output)]
}
output[i] = rs
i++
if i == cap(output) {
slices.Grow(output, cap(output))
}
}
} else {
@ -64,8 +54,8 @@ func SeqToSliceWorker(worker SeqWorker,
s.Id(), err)
return BioSequenceSlice{}, err
} else {
log.Warnf("got an error on sequence %s processing",
s.Id())
log.Warnf("got an error on sequence %s processing : %v",
s.Id(), err)
}
}
}
@ -80,18 +70,14 @@ func SeqToSliceWorker(worker SeqWorker,
func SeqToSliceConditionalWorker(
condition SequencePredicate,
worker SeqWorker,
inplace, breakOnError bool) SeqSliceWorker {
worker SeqWorker, breakOnError bool) SeqSliceWorker {
if condition == nil {
return SeqToSliceWorker(worker, inplace, breakOnError)
return SeqToSliceWorker(worker, breakOnError)
}
f := func(input BioSequenceSlice) (BioSequenceSlice, error) {
output := input
if !inplace {
output = MakeBioSequenceSlice(len(input))
}
output := MakeBioSequenceSlice(len(input))
i := 0
@ -100,11 +86,12 @@ func SeqToSliceConditionalWorker(
r, err := worker(s)
if err == nil {
for _, rs := range r {
if i == len(output) {
output = slices.Grow(output, cap(output))
output = output[:cap(output)]
}
output[i] = rs
i++
if i == cap(output) {
slices.Grow(output, cap(output))
}
}
} else {
if breakOnError {
@ -112,8 +99,8 @@ func SeqToSliceConditionalWorker(
s.Id(), err)
return BioSequenceSlice{}, err
} else {
log.Warnf("got an error on sequence %s processing",
s.Id())
log.Warnf("got an error on sequence %s processing : %v",
s.Id(), err)
}
}
}
@ -134,7 +121,7 @@ func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker {
}
}
sw := SeqToSliceWorker(next, true, false)
sw := SeqToSliceWorker(next, false)
f := func(seq *BioSequence) (BioSequenceSlice, error) {
if seq == nil {

View File

@ -313,7 +313,7 @@ func CLIAnnotationPipeline() obiiter.Pipeable {
predicate := obigrep.CLISequenceSelectionPredicate()
worker := CLIAnnotationWorker()
annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, false)
annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, false)
f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
return f

View File

@ -0,0 +1,257 @@
package obisplit
import (
"fmt"
"sort"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
type SplitSequence struct {
pattern string
name string
forward_pattern obiapat.ApatPattern
reverse_pattern obiapat.ApatPattern
}
type Pattern_match struct {
name string
pattern string
match string
begin int
end int
nerrors int
forward bool
}
func LocatePatterns(sequence *obiseq.BioSequence,
patterns []SplitSequence) []Pattern_match {
aseq, err := obiapat.MakeApatSequence(sequence, false)
if err != nil {
log.Fatalf("Cannot index sequence %s for patern matching", sequence.Id())
}
res := make([]Pattern_match, 0, 10)
for _, split := range patterns {
ms := split.forward_pattern.AllMatches(aseq, 0, aseq.Len())
for _, m := range ms {
m[0] = max(0, m[0])
m[1] = min(sequence.Len(), m[1])
match, err := sequence.Subsequence(m[0], m[1], false)
if err != nil {
log.Fatalf("Cannot extract pattern %s from sequence %s", split.pattern, sequence.Id())
}
res = append(res, Pattern_match{
name: split.name,
pattern: split.pattern,
match: match.String(),
begin: m[0],
end: m[1],
nerrors: m[2],
forward: true,
})
}
ms = split.reverse_pattern.AllMatches(aseq, 0, aseq.Len())
for _, m := range ms {
m[0] = max(0, m[0])
m[1] = min(sequence.Len(), m[1])
match, err := sequence.Subsequence(m[0], m[1], false)
if err != nil {
log.Fatalf("Cannot extract reverse pattern %s from sequence %s", split.pattern, sequence.Id())
}
match = match.ReverseComplement(true)
res = append(res, Pattern_match{
name: split.name,
pattern: split.pattern,
match: match.String(),
begin: m[0],
end: m[1],
nerrors: m[2],
forward: false,
})
}
}
sort.Slice(res, func(i, j int) bool {
a := res[i].begin
b := res[j].begin
return a < b
})
log.Debugf("Sequence %s Raw match : %v", sequence.Id(), res)
if len(res) > 1 {
j := 0
m1 := res[0]
for _, m2 := range res[1:] {
if m2.begin < m1.end {
if m2.nerrors < m1.nerrors {
m1 = m2
}
continue
}
res[j] = m1
m1 = m2
j++
}
res[j] = m1
res = res[:j+1]
}
log.Debugf("Sequence %s No overlap match : %v", sequence.Id(), res)
return res
}
func SplitPattern(sequence *obiseq.BioSequence,
patterns []SplitSequence) (obiseq.BioSequenceSlice, error) {
matches := LocatePatterns(sequence, patterns)
from := Pattern_match{
name: "5extremity",
pattern: "",
match: "",
begin: 0,
end: 0,
nerrors: 0,
forward: true,
}
res := obiseq.MakeBioSequenceSlice(10)
nfrag := 0
res = res[:nfrag]
for i, to := range matches {
log.Debugf("from : %v to : %v", from, to)
start := from.end
end := to.begin
if i == 0 && end <= 0 {
from = to
continue
}
if end > start {
log.Debugf("Extracting fragment %d from sequence %s [%d:%d]",
nfrag+1, sequence.Id(),
start, end,
)
sub, err := sequence.Subsequence(start, end, false)
if err != nil {
return res[:nfrag],
fmt.Errorf("cannot extract fragment %d from sequence %s [%d:%d]",
nfrag+1, sequence.Id(),
start, end,
)
}
nfrag++
sub.SetAttribute("obisplit_frg", nfrag)
if from.name == to.name {
sub.SetAttribute("obisplit_group", from.name)
} else {
sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name))
}
sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end))
sub.SetAttribute("obisplit_right_error", to.nerrors)
sub.SetAttribute("obisplit_left_error", from.nerrors)
sub.SetAttribute("obisplit_right_pattern", to.pattern)
sub.SetAttribute("obisplit_left_pattern", from.pattern)
sub.SetAttribute("obisplit_left_match", from.match)
sub.SetAttribute("obisplit_right_match", to.match)
res = append(res, sub)
}
from = to
}
if from.end < sequence.Len() {
to := Pattern_match{
name: "3extremity",
pattern: "",
match: "",
begin: sequence.Len(),
end: sequence.Len(),
nerrors: 0,
forward: true,
}
start := from.end
end := to.begin
sub, err := sequence.Subsequence(start, end, false)
if err != nil {
return res[:nfrag],
fmt.Errorf("cannot extract last fragment %d from sequence %s [%d:%d]",
nfrag+1, sequence.Id(),
start, end,
)
}
nfrag++
sub.SetAttribute("obisplit_frg", nfrag)
if from.name == to.name {
sub.SetAttribute("obisplit_group", from.name)
} else {
sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name))
}
sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end))
sub.SetAttribute("obisplit_right_error", to.nerrors)
sub.SetAttribute("obisplit_left_error", from.nerrors)
sub.SetAttribute("obisplit_right_pattern", to.pattern)
sub.SetAttribute("obisplit_left_pattern", from.pattern)
sub.SetAttribute("obisplit_left_match", from.match)
sub.SetAttribute("obisplit_right_match", to.match)
res = append(res, sub)
}
return res[:nfrag], nil
}
func SplitPatternWorker(patterns []SplitSequence) obiseq.SeqWorker {
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
return SplitPattern(sequence, patterns)
}
return f
}
func CLISlitPipeline() obiiter.Pipeable {
worker := SplitPatternWorker(CLIConfig())
annotator := obiseq.SeqToSliceWorker(worker, false)
f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
return f
}

View File

@ -0,0 +1,135 @@
package obisplit
import (
"encoding/csv"
"fmt"
"log"
"os"
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var _askTemplate = false
var _config = ""
var _pattern_error = 4
var _pattern_indel = false
func SplitOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_config, "config", _config,
options.Description("The configuration file."),
options.Alias("C"))
options.BoolVar(&_askTemplate, "template", _askTemplate,
options.Description("Print on the standard output a script template."),
)
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
options.Description("Maximum number of allowed error during pattern matching"),
)
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
options.Description("Allows for indel during pattern matching"),
)
}
func OptionSet(options *getoptions.GetOpt) {
SplitOptionSet(options)
obiconvert.OptionSet(options)
}
func CLIHasConfig() bool {
return CLIConfigFile() != ""
}
func CLIConfigFile() string {
return _config
}
func CLIConfig() []SplitSequence {
// os.Open() opens specific file in
// read-only mode and this return
// a pointer of type os.File
file, err := os.Open(CLIConfigFile())
// Checks for the error
if err != nil {
log.Fatal("Error while reading the file", err)
}
// Closes the file
defer file.Close()
reader := csv.NewReader(file)
records, err := reader.ReadAll()
// Checks for the error
if err != nil {
fmt.Println("Error reading records")
}
config := make([]SplitSequence, 0, max(0, len(records)-1))
header := records[0]
pattern_idx := slices.Index(header, "T-tag")
pool_idx := slices.Index(header, "pcr_pool")
if pattern_idx == -1 {
log.Fatalf("Config file %s doesn't contain `T-tag`column", CLIConfigFile())
}
if pool_idx == -1 {
pool_idx = pattern_idx
}
// Loop to iterate through
// and print each of the string slice
for _, eachrecord := range records[1:] {
fp, err := obiapat.MakeApatPattern(eachrecord[pattern_idx],
CLIPatternError(), CLIPatternInDels())
if err != nil {
log.Fatalf("Error cannot compile pattern %s : %v",
eachrecord[pattern_idx], err)
}
rv, err := fp.ReverseComplement()
if err != nil {
log.Fatalf("Error cannot reverse complement pattern %s: %v",
eachrecord[pattern_idx], err)
}
config = append(config, SplitSequence{
pattern: eachrecord[pattern_idx],
name: eachrecord[pool_idx],
forward_pattern: fp,
reverse_pattern: rv,
})
}
return config
}
func CLIPatternError() int {
return _pattern_error
}
func CLIPatternInDels() bool {
return _pattern_indel
}
func CLIAskConfigTemplate() bool {
return _askTemplate
}
func CLIConfigTemplate() string {
return `T-tag,pcr_pool
CGGCACCTGTTACGCAGCCACTATCGGCT,pool_1
CGGCAAGACCCTATTGCATTGGCGCGGCT,pool_2
`
}