First version of obisplit and patch a bug in the new workers API

Former-commit-id: f28af9f104c08d68e29fd866739d8dd58241da63
This commit is contained in:
2024-03-03 11:16:24 -04:00
parent 0f3871d203
commit 7bd073ccd4
7 changed files with 507 additions and 37 deletions

View File

@@ -313,7 +313,7 @@ func CLIAnnotationPipeline() obiiter.Pipeable {
predicate := obigrep.CLISequenceSelectionPredicate()
worker := CLIAnnotationWorker()
annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, false)
annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, false)
f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
return f

View File

@@ -0,0 +1,257 @@
package obisplit
import (
"fmt"
"sort"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
type SplitSequence struct {
pattern string
name string
forward_pattern obiapat.ApatPattern
reverse_pattern obiapat.ApatPattern
}
type Pattern_match struct {
name string
pattern string
match string
begin int
end int
nerrors int
forward bool
}
func LocatePatterns(sequence *obiseq.BioSequence,
patterns []SplitSequence) []Pattern_match {
aseq, err := obiapat.MakeApatSequence(sequence, false)
if err != nil {
log.Fatalf("Cannot index sequence %s for patern matching", sequence.Id())
}
res := make([]Pattern_match, 0, 10)
for _, split := range patterns {
ms := split.forward_pattern.AllMatches(aseq, 0, aseq.Len())
for _, m := range ms {
m[0] = max(0, m[0])
m[1] = min(sequence.Len(), m[1])
match, err := sequence.Subsequence(m[0], m[1], false)
if err != nil {
log.Fatalf("Cannot extract pattern %s from sequence %s", split.pattern, sequence.Id())
}
res = append(res, Pattern_match{
name: split.name,
pattern: split.pattern,
match: match.String(),
begin: m[0],
end: m[1],
nerrors: m[2],
forward: true,
})
}
ms = split.reverse_pattern.AllMatches(aseq, 0, aseq.Len())
for _, m := range ms {
m[0] = max(0, m[0])
m[1] = min(sequence.Len(), m[1])
match, err := sequence.Subsequence(m[0], m[1], false)
if err != nil {
log.Fatalf("Cannot extract reverse pattern %s from sequence %s", split.pattern, sequence.Id())
}
match = match.ReverseComplement(true)
res = append(res, Pattern_match{
name: split.name,
pattern: split.pattern,
match: match.String(),
begin: m[0],
end: m[1],
nerrors: m[2],
forward: false,
})
}
}
sort.Slice(res, func(i, j int) bool {
a := res[i].begin
b := res[j].begin
return a < b
})
log.Debugf("Sequence %s Raw match : %v", sequence.Id(), res)
if len(res) > 1 {
j := 0
m1 := res[0]
for _, m2 := range res[1:] {
if m2.begin < m1.end {
if m2.nerrors < m1.nerrors {
m1 = m2
}
continue
}
res[j] = m1
m1 = m2
j++
}
res[j] = m1
res = res[:j+1]
}
log.Debugf("Sequence %s No overlap match : %v", sequence.Id(), res)
return res
}
func SplitPattern(sequence *obiseq.BioSequence,
patterns []SplitSequence) (obiseq.BioSequenceSlice, error) {
matches := LocatePatterns(sequence, patterns)
from := Pattern_match{
name: "5extremity",
pattern: "",
match: "",
begin: 0,
end: 0,
nerrors: 0,
forward: true,
}
res := obiseq.MakeBioSequenceSlice(10)
nfrag := 0
res = res[:nfrag]
for i, to := range matches {
log.Debugf("from : %v to : %v", from, to)
start := from.end
end := to.begin
if i == 0 && end <= 0 {
from = to
continue
}
if end > start {
log.Debugf("Extracting fragment %d from sequence %s [%d:%d]",
nfrag+1, sequence.Id(),
start, end,
)
sub, err := sequence.Subsequence(start, end, false)
if err != nil {
return res[:nfrag],
fmt.Errorf("cannot extract fragment %d from sequence %s [%d:%d]",
nfrag+1, sequence.Id(),
start, end,
)
}
nfrag++
sub.SetAttribute("obisplit_frg", nfrag)
if from.name == to.name {
sub.SetAttribute("obisplit_group", from.name)
} else {
sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name))
}
sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end))
sub.SetAttribute("obisplit_right_error", to.nerrors)
sub.SetAttribute("obisplit_left_error", from.nerrors)
sub.SetAttribute("obisplit_right_pattern", to.pattern)
sub.SetAttribute("obisplit_left_pattern", from.pattern)
sub.SetAttribute("obisplit_left_match", from.match)
sub.SetAttribute("obisplit_right_match", to.match)
res = append(res, sub)
}
from = to
}
if from.end < sequence.Len() {
to := Pattern_match{
name: "3extremity",
pattern: "",
match: "",
begin: sequence.Len(),
end: sequence.Len(),
nerrors: 0,
forward: true,
}
start := from.end
end := to.begin
sub, err := sequence.Subsequence(start, end, false)
if err != nil {
return res[:nfrag],
fmt.Errorf("cannot extract last fragment %d from sequence %s [%d:%d]",
nfrag+1, sequence.Id(),
start, end,
)
}
nfrag++
sub.SetAttribute("obisplit_frg", nfrag)
if from.name == to.name {
sub.SetAttribute("obisplit_group", from.name)
} else {
sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name))
}
sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end))
sub.SetAttribute("obisplit_right_error", to.nerrors)
sub.SetAttribute("obisplit_left_error", from.nerrors)
sub.SetAttribute("obisplit_right_pattern", to.pattern)
sub.SetAttribute("obisplit_left_pattern", from.pattern)
sub.SetAttribute("obisplit_left_match", from.match)
sub.SetAttribute("obisplit_right_match", to.match)
res = append(res, sub)
}
return res[:nfrag], nil
}
func SplitPatternWorker(patterns []SplitSequence) obiseq.SeqWorker {
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
return SplitPattern(sequence, patterns)
}
return f
}
func CLISlitPipeline() obiiter.Pipeable {
worker := SplitPatternWorker(CLIConfig())
annotator := obiseq.SeqToSliceWorker(worker, false)
f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
return f
}

View File

@@ -0,0 +1,135 @@
package obisplit
import (
"encoding/csv"
"fmt"
"log"
"os"
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var _askTemplate = false
var _config = ""
var _pattern_error = 4
var _pattern_indel = false
func SplitOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_config, "config", _config,
options.Description("The configuration file."),
options.Alias("C"))
options.BoolVar(&_askTemplate, "template", _askTemplate,
options.Description("Print on the standard output a script template."),
)
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
options.Description("Maximum number of allowed error during pattern matching"),
)
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
options.Description("Allows for indel during pattern matching"),
)
}
func OptionSet(options *getoptions.GetOpt) {
SplitOptionSet(options)
obiconvert.OptionSet(options)
}
func CLIHasConfig() bool {
return CLIConfigFile() != ""
}
func CLIConfigFile() string {
return _config
}
func CLIConfig() []SplitSequence {
// os.Open() opens specific file in
// read-only mode and this return
// a pointer of type os.File
file, err := os.Open(CLIConfigFile())
// Checks for the error
if err != nil {
log.Fatal("Error while reading the file", err)
}
// Closes the file
defer file.Close()
reader := csv.NewReader(file)
records, err := reader.ReadAll()
// Checks for the error
if err != nil {
fmt.Println("Error reading records")
}
config := make([]SplitSequence, 0, max(0, len(records)-1))
header := records[0]
pattern_idx := slices.Index(header, "T-tag")
pool_idx := slices.Index(header, "pcr_pool")
if pattern_idx == -1 {
log.Fatalf("Config file %s doesn't contain `T-tag`column", CLIConfigFile())
}
if pool_idx == -1 {
pool_idx = pattern_idx
}
// Loop to iterate through
// and print each of the string slice
for _, eachrecord := range records[1:] {
fp, err := obiapat.MakeApatPattern(eachrecord[pattern_idx],
CLIPatternError(), CLIPatternInDels())
if err != nil {
log.Fatalf("Error cannot compile pattern %s : %v",
eachrecord[pattern_idx], err)
}
rv, err := fp.ReverseComplement()
if err != nil {
log.Fatalf("Error cannot reverse complement pattern %s: %v",
eachrecord[pattern_idx], err)
}
config = append(config, SplitSequence{
pattern: eachrecord[pattern_idx],
name: eachrecord[pool_idx],
forward_pattern: fp,
reverse_pattern: rv,
})
}
return config
}
func CLIPatternError() int {
return _pattern_error
}
func CLIPatternInDels() bool {
return _pattern_indel
}
func CLIAskConfigTemplate() bool {
return _askTemplate
}
func CLIConfigTemplate() string {
return `T-tag,pcr_pool
CGGCACCTGTTACGCAGCCACTATCGGCT,pool_1
CGGCAAGACCCTATTGCATTGGCGCGGCT,pool_2
`
}