mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-13 18:50:26 +00:00
First version of obisplit and patch a bug in the new workers API
Former-commit-id: f28af9f104c08d68e29fd866739d8dd58241da63
This commit is contained in:
@@ -313,7 +313,7 @@ func CLIAnnotationPipeline() obiiter.Pipeable {
|
||||
predicate := obigrep.CLISequenceSelectionPredicate()
|
||||
worker := CLIAnnotationWorker()
|
||||
|
||||
annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, true, false)
|
||||
annotator := obiseq.SeqToSliceConditionalWorker(predicate, worker, false)
|
||||
f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
|
||||
|
||||
return f
|
||||
|
||||
257
pkg/obitools/obisplit/obisplit.go
Normal file
257
pkg/obitools/obisplit/obisplit.go
Normal file
@@ -0,0 +1,257 @@
|
||||
package obisplit
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
type SplitSequence struct {
|
||||
pattern string
|
||||
name string
|
||||
forward_pattern obiapat.ApatPattern
|
||||
reverse_pattern obiapat.ApatPattern
|
||||
}
|
||||
|
||||
// Pattern_match records one occurrence of a splitting pattern on a sequence.
type Pattern_match struct {
	// name and pattern are copied from the SplitSequence that matched;
	// match is the matched piece of the sequence (reverse-complemented
	// when the occurrence was found with the reverse pattern).
	name, pattern, match string

	// begin and end delimit the occurrence on the sequence, clamped to
	// [0, sequence length]; nerrors is the error count reported by the
	// pattern matcher for this occurrence.
	begin, end, nerrors int

	// forward is true when the occurrence was found with the forward
	// pattern and false when it was found with its reverse complement.
	forward bool
}
|
||||
|
||||
func LocatePatterns(sequence *obiseq.BioSequence,
|
||||
patterns []SplitSequence) []Pattern_match {
|
||||
|
||||
aseq, err := obiapat.MakeApatSequence(sequence, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot index sequence %s for patern matching", sequence.Id())
|
||||
}
|
||||
|
||||
res := make([]Pattern_match, 0, 10)
|
||||
|
||||
for _, split := range patterns {
|
||||
ms := split.forward_pattern.AllMatches(aseq, 0, aseq.Len())
|
||||
for _, m := range ms {
|
||||
m[0] = max(0, m[0])
|
||||
m[1] = min(sequence.Len(), m[1])
|
||||
match, err := sequence.Subsequence(m[0], m[1], false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot extract pattern %s from sequence %s", split.pattern, sequence.Id())
|
||||
}
|
||||
|
||||
res = append(res, Pattern_match{
|
||||
name: split.name,
|
||||
pattern: split.pattern,
|
||||
match: match.String(),
|
||||
begin: m[0],
|
||||
end: m[1],
|
||||
nerrors: m[2],
|
||||
forward: true,
|
||||
})
|
||||
}
|
||||
|
||||
ms = split.reverse_pattern.AllMatches(aseq, 0, aseq.Len())
|
||||
for _, m := range ms {
|
||||
m[0] = max(0, m[0])
|
||||
m[1] = min(sequence.Len(), m[1])
|
||||
match, err := sequence.Subsequence(m[0], m[1], false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot extract reverse pattern %s from sequence %s", split.pattern, sequence.Id())
|
||||
}
|
||||
|
||||
match = match.ReverseComplement(true)
|
||||
|
||||
res = append(res, Pattern_match{
|
||||
name: split.name,
|
||||
pattern: split.pattern,
|
||||
match: match.String(),
|
||||
begin: m[0],
|
||||
end: m[1],
|
||||
nerrors: m[2],
|
||||
forward: false,
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
sort.Slice(res, func(i, j int) bool {
|
||||
a := res[i].begin
|
||||
b := res[j].begin
|
||||
return a < b
|
||||
})
|
||||
|
||||
log.Debugf("Sequence %s Raw match : %v", sequence.Id(), res)
|
||||
if len(res) > 1 {
|
||||
j := 0
|
||||
m1 := res[0]
|
||||
for _, m2 := range res[1:] {
|
||||
if m2.begin < m1.end {
|
||||
if m2.nerrors < m1.nerrors {
|
||||
m1 = m2
|
||||
}
|
||||
continue
|
||||
}
|
||||
res[j] = m1
|
||||
m1 = m2
|
||||
j++
|
||||
}
|
||||
|
||||
res[j] = m1
|
||||
res = res[:j+1]
|
||||
}
|
||||
|
||||
log.Debugf("Sequence %s No overlap match : %v", sequence.Id(), res)
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
func SplitPattern(sequence *obiseq.BioSequence,
|
||||
patterns []SplitSequence) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
matches := LocatePatterns(sequence, patterns)
|
||||
|
||||
from := Pattern_match{
|
||||
name: "5extremity",
|
||||
pattern: "",
|
||||
match: "",
|
||||
begin: 0,
|
||||
end: 0,
|
||||
nerrors: 0,
|
||||
forward: true,
|
||||
}
|
||||
|
||||
res := obiseq.MakeBioSequenceSlice(10)
|
||||
nfrag := 0
|
||||
res = res[:nfrag]
|
||||
|
||||
for i, to := range matches {
|
||||
log.Debugf("from : %v to : %v", from, to)
|
||||
start := from.end
|
||||
end := to.begin
|
||||
|
||||
if i == 0 && end <= 0 {
|
||||
from = to
|
||||
continue
|
||||
}
|
||||
|
||||
if end > start {
|
||||
log.Debugf("Extracting fragment %d from sequence %s [%d:%d]",
|
||||
nfrag+1, sequence.Id(),
|
||||
start, end,
|
||||
)
|
||||
|
||||
sub, err := sequence.Subsequence(start, end, false)
|
||||
|
||||
if err != nil {
|
||||
return res[:nfrag],
|
||||
fmt.Errorf("cannot extract fragment %d from sequence %s [%d:%d]",
|
||||
nfrag+1, sequence.Id(),
|
||||
start, end,
|
||||
)
|
||||
}
|
||||
|
||||
nfrag++
|
||||
sub.SetAttribute("obisplit_frg", nfrag)
|
||||
|
||||
if from.name == to.name {
|
||||
sub.SetAttribute("obisplit_group", from.name)
|
||||
} else {
|
||||
sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name))
|
||||
}
|
||||
|
||||
sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end))
|
||||
|
||||
sub.SetAttribute("obisplit_right_error", to.nerrors)
|
||||
sub.SetAttribute("obisplit_left_error", from.nerrors)
|
||||
|
||||
sub.SetAttribute("obisplit_right_pattern", to.pattern)
|
||||
sub.SetAttribute("obisplit_left_pattern", from.pattern)
|
||||
|
||||
sub.SetAttribute("obisplit_left_match", from.match)
|
||||
sub.SetAttribute("obisplit_right_match", to.match)
|
||||
|
||||
res = append(res, sub)
|
||||
|
||||
}
|
||||
from = to
|
||||
}
|
||||
|
||||
if from.end < sequence.Len() {
|
||||
to := Pattern_match{
|
||||
name: "3extremity",
|
||||
pattern: "",
|
||||
match: "",
|
||||
begin: sequence.Len(),
|
||||
end: sequence.Len(),
|
||||
nerrors: 0,
|
||||
forward: true,
|
||||
}
|
||||
|
||||
start := from.end
|
||||
end := to.begin
|
||||
|
||||
sub, err := sequence.Subsequence(start, end, false)
|
||||
|
||||
if err != nil {
|
||||
return res[:nfrag],
|
||||
fmt.Errorf("cannot extract last fragment %d from sequence %s [%d:%d]",
|
||||
nfrag+1, sequence.Id(),
|
||||
start, end,
|
||||
)
|
||||
}
|
||||
|
||||
nfrag++
|
||||
sub.SetAttribute("obisplit_frg", nfrag)
|
||||
if from.name == to.name {
|
||||
sub.SetAttribute("obisplit_group", from.name)
|
||||
} else {
|
||||
sub.SetAttribute("obisplit_group", fmt.Sprintf("%s-%s", from.name, to.name))
|
||||
}
|
||||
sub.SetAttribute("obisplit_location", fmt.Sprintf("%d..%d", start, end))
|
||||
|
||||
sub.SetAttribute("obisplit_right_error", to.nerrors)
|
||||
sub.SetAttribute("obisplit_left_error", from.nerrors)
|
||||
|
||||
sub.SetAttribute("obisplit_right_pattern", to.pattern)
|
||||
sub.SetAttribute("obisplit_left_pattern", from.pattern)
|
||||
|
||||
sub.SetAttribute("obisplit_left_match", from.match)
|
||||
sub.SetAttribute("obisplit_right_match", to.match)
|
||||
|
||||
res = append(res, sub)
|
||||
|
||||
}
|
||||
|
||||
return res[:nfrag], nil
|
||||
}
|
||||
|
||||
func SplitPatternWorker(patterns []SplitSequence) obiseq.SeqWorker {
|
||||
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
return SplitPattern(sequence, patterns)
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CLISlitPipeline() obiiter.Pipeable {
|
||||
|
||||
worker := SplitPatternWorker(CLIConfig())
|
||||
|
||||
annotator := obiseq.SeqToSliceWorker(worker, false)
|
||||
f := obiiter.SliceWorkerPipe(annotator, false, obioptions.CLIParallelWorkers())
|
||||
|
||||
return f
|
||||
}
|
||||
135
pkg/obitools/obisplit/options.go
Normal file
135
pkg/obitools/obisplit/options.go
Normal file
@@ -0,0 +1,135 @@
|
||||
package obisplit
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Command-line option storage; the values below are the defaults used
// when the corresponding option is not given.
var (
	// _askTemplate is bound to the --template flag.
	_askTemplate = false
	// _config is bound to the --config option (configuration file name).
	_config = ""
	// _pattern_error is bound to the --pattern-error option.
	_pattern_error = 4
	// _pattern_indel is bound to the --allows-indels flag.
	_pattern_indel = false
)
|
||||
|
||||
func SplitOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.StringVar(&_config, "config", _config,
|
||||
options.Description("The configuration file."),
|
||||
options.Alias("C"))
|
||||
|
||||
options.BoolVar(&_askTemplate, "template", _askTemplate,
|
||||
options.Description("Print on the standard output a script template."),
|
||||
)
|
||||
|
||||
options.IntVar(&_pattern_error, "pattern-error", _pattern_error,
|
||||
options.Description("Maximum number of allowed error during pattern matching"),
|
||||
)
|
||||
|
||||
options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel,
|
||||
options.Description("Allows for indel during pattern matching"),
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
SplitOptionSet(options)
|
||||
obiconvert.OptionSet(options)
|
||||
}
|
||||
|
||||
func CLIHasConfig() bool {
|
||||
return CLIConfigFile() != ""
|
||||
}
|
||||
|
||||
func CLIConfigFile() string {
|
||||
return _config
|
||||
}
|
||||
|
||||
func CLIConfig() []SplitSequence {
|
||||
// os.Open() opens specific file in
|
||||
// read-only mode and this return
|
||||
// a pointer of type os.File
|
||||
file, err := os.Open(CLIConfigFile())
|
||||
|
||||
// Checks for the error
|
||||
if err != nil {
|
||||
log.Fatal("Error while reading the file", err)
|
||||
}
|
||||
|
||||
// Closes the file
|
||||
defer file.Close()
|
||||
|
||||
reader := csv.NewReader(file)
|
||||
records, err := reader.ReadAll()
|
||||
|
||||
// Checks for the error
|
||||
if err != nil {
|
||||
fmt.Println("Error reading records")
|
||||
}
|
||||
|
||||
config := make([]SplitSequence, 0, max(0, len(records)-1))
|
||||
|
||||
header := records[0]
|
||||
|
||||
pattern_idx := slices.Index(header, "T-tag")
|
||||
pool_idx := slices.Index(header, "pcr_pool")
|
||||
|
||||
if pattern_idx == -1 {
|
||||
log.Fatalf("Config file %s doesn't contain `T-tag`column", CLIConfigFile())
|
||||
}
|
||||
|
||||
if pool_idx == -1 {
|
||||
pool_idx = pattern_idx
|
||||
}
|
||||
|
||||
// Loop to iterate through
|
||||
// and print each of the string slice
|
||||
for _, eachrecord := range records[1:] {
|
||||
|
||||
fp, err := obiapat.MakeApatPattern(eachrecord[pattern_idx],
|
||||
CLIPatternError(), CLIPatternInDels())
|
||||
if err != nil {
|
||||
log.Fatalf("Error cannot compile pattern %s : %v",
|
||||
eachrecord[pattern_idx], err)
|
||||
}
|
||||
|
||||
rv, err := fp.ReverseComplement()
|
||||
if err != nil {
|
||||
log.Fatalf("Error cannot reverse complement pattern %s: %v",
|
||||
eachrecord[pattern_idx], err)
|
||||
}
|
||||
|
||||
config = append(config, SplitSequence{
|
||||
pattern: eachrecord[pattern_idx],
|
||||
name: eachrecord[pool_idx],
|
||||
forward_pattern: fp,
|
||||
reverse_pattern: rv,
|
||||
})
|
||||
}
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
func CLIPatternError() int {
|
||||
return _pattern_error
|
||||
}
|
||||
|
||||
func CLIPatternInDels() bool {
|
||||
return _pattern_indel
|
||||
}
|
||||
|
||||
func CLIAskConfigTemplate() bool {
|
||||
return _askTemplate
|
||||
}
|
||||
|
||||
// CLIConfigTemplate returns an example configuration file: a CSV header
// naming the `T-tag` and `pcr_pool` columns followed by two sample rows.
func CLIConfigTemplate() string {
	return "T-tag,pcr_pool\n" +
		"CGGCACCTGTTACGCAGCCACTATCGGCT,pool_1\n" +
		"CGGCAAGACCCTATTGCATTGGCGCGGCT,pool_2\n"
}
|
||||
Reference in New Issue
Block a user