From d70b0a5b424b7ca2bbb962f3aba9a3fc234532f8 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 16 Feb 2025 21:47:49 +0100 Subject: [PATCH] A first version of obimicroasm... --- cmd/obitools/obimicroasm/main.go | 42 +++ go.mod | 6 +- go.sum | 8 + go.work.sum | 9 +- pkg/obikmer/debruijn.go | 468 ++++++++++++++++++++++-- pkg/obikmer/oneerror.go | 45 +++ pkg/obioptions/version.go | 2 +- pkg/obiseq/paired_reads.go | 2 +- pkg/obitools/obimicroasm/microasm.go | 520 +++++++++++++++++++++++++++ pkg/obitools/obimicroasm/options.go | 139 +++++++ pkg/obiutils/unique.go | 20 ++ 11 files changed, 1234 insertions(+), 27 deletions(-) create mode 100644 cmd/obitools/obimicroasm/main.go create mode 100644 pkg/obikmer/oneerror.go create mode 100644 pkg/obitools/obimicroasm/microasm.go create mode 100644 pkg/obitools/obimicroasm/options.go create mode 100644 pkg/obiutils/unique.go diff --git a/cmd/obitools/obimicroasm/main.go b/cmd/obitools/obimicroasm/main.go new file mode 100644 index 0000000..bd5e127 --- /dev/null +++ b/cmd/obitools/obimicroasm/main.go @@ -0,0 +1,42 @@ +package main + +import ( + "os" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimicroasm" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" +) + +func main() { + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser(obimicroasm.OptionSet) + + optionParser(os.Args) + + obidefault.SetStrictReadWorker(2) + obidefault.SetStrictWriteWorker(2) + + seq := obimicroasm.CLIAssemblePCR() + + println(obiformats.FormatFasta(seq, obiformats.FormatFastSeqJsonHeader)) + obiutils.WaitForLastPipe() +} diff --git a/go.mod b/go.mod index afcb2e0..3f8f173 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/rrethy/ahocorasick v1.0.0 github.com/schollz/progressbar/v3 v3.13.1 github.com/sirupsen/logrus v1.9.3 - github.com/stretchr/testify v1.8.4 + github.com/stretchr/testify v1.10.0 github.com/tevino/abool/v2 v2.1.0 github.com/yuin/gopher-lua v1.1.1 golang.org/x/exp v0.0.0-20231006140011-7918f672742d @@ -29,11 +29,13 @@ require ( github.com/buger/jsonparser v1.1.1 // indirect github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/ef-ds/deque/v2 v2.0.2 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/kr/pretty v0.3.0 // indirect github.com/kr/text v0.2.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.6.1 // indirect + go.etcd.io/bbolt v1.4.0 // indirect ) require ( @@ -47,7 +49,7 @@ require ( github.com/shopspring/decimal v1.3.1 // indirect github.com/ulikunitz/xz v0.5.11 golang.org/x/net v0.17.0 // indirect - golang.org/x/sys v0.17.0 // indirect + golang.org/x/sys v0.29.0 // indirect golang.org/x/term v0.13.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c ) diff --git a/go.sum b/go.sum index 9bac5f4..ba8f5ea 100644 --- a/go.sum 
+++ b/go.sum @@ -25,6 +25,8 @@ github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cn github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q= github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= +github.com/ef-ds/deque/v2 v2.0.2 h1:GQtDK1boBMu/qsNbSLQsqzwNptaioxZI39X3UxT5ALA= +github.com/ef-ds/deque/v2 v2.0.2/go.mod h1:hoZy4VooWLhRT4uS+sSCilfgBQUNptJU2FGqr08a5sc= github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0= github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA= @@ -77,6 +79,8 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c= github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY= github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8= @@ -84,6 +88,8 @@ github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8= github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= +go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= +go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= @@ -93,6 +99,8 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= diff --git a/go.work.sum b/go.work.sum index e29e8c9..a4fe3d1 100644 --- a/go.work.sum +++ b/go.work.sum @@ -2,12 +2,9 @@ git.sr.ht/~sbinet/gg v0.3.1 h1:LNhjNn8DerC8f9DHLz6lS0YYul/b602DUxDgGkd/Aik= git.sr.ht/~sbinet/gg v0.3.1/go.mod h1:KGYtlADtqsqANL9ueOFkWymvzUvLMQllU5Ixo+8v3pc= github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b h1:slYM766cy2nI3BwyRiyQj/Ud48djTMtMebDqepE95rw= github.com/ajstarks/svgo 
v0.0.0-20211024235047-1546f124cd8b/go.mod h1:1KcenG0jGWcpt8ov532z81sp/kMMUG485J2InIOyADM= -github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/logex v1.2.0 h1:+eqR0HfOetur4tgnC8ftU5imRnhi4te+BadWS95c5AM= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/chzyer/test v0.0.0-20210722231415-061457976a23 h1:dZ0/VyGgQdVGAss6Ju0dt5P0QltE0SFY5Woh6hbIfiQ= @@ -29,16 +26,21 @@ github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/ianlancetaylor/demangle v0.0.0-20220319035150-800ac71e25c2 h1:rcanfLhLDA8nozr/K289V1zcntHr3V+SHlXwzz1ZI2g= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213 h1:qGQQKEcAR99REcMpsXCp3lJ03zYT1PkRd3kQGPn9GVg= github.com/klauspost/cpuid v1.2.0 h1:NMpwD2G9JSFOE1/TJjGSo5zG7Yb2bTe7eq1jH+irmeE= github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= github.com/smallnest/goroutine v1.1.1/go.mod h1:Fp8f6ZReubfdj0m4+NcUnW4IsAqKa+Pnrv9opEiD43E= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE= +go.etcd.io/gofail v0.2.0/go.mod h1:nL3ILMGfkXTekKI3clMBNazKnjUZjYLKmBHzsVAnC1o= golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/image v0.6.0 h1:bR8b5okrPI3g/gyZakLZHeWxAR8Dn5CyxXv1hLH5g/4= @@ -46,6 +48,7 @@ golang.org/x/image v0.6.0/go.mod h1:MXLdDR43H7cDJq5GEGXEVeeNhPgi+YYEQ2pC1byI1x0= golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY= golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/tools v0.14.0 
h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc= diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index a707ca7..ef6ddf1 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -8,9 +8,12 @@ import ( "math/bits" "os" "slices" + "sort" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + "github.com/ef-ds/deque/v2" log "github.com/sirupsen/logrus" ) @@ -89,12 +92,18 @@ type DeBruijnGraph struct { // // *DeBruijnGraph - a pointer to the created De Bruijn's Graph func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph { + if kmersize > 31 { + log.Panicf("k-mer size %d is too large", kmersize) + } + + kmermask := (^uint64(0) << (uint64(kmersize) * 2)) + g := DeBruijnGraph{ kmersize: kmersize, - kmermask: ^(^uint64(0) << (uint64(kmersize) * 2)), // k-mer mask used to set to 0 the bits that are not in the k-mer - prevc: uint64(1) << (uint64(kmersize-1) * 2), - prevg: uint64(2) << (uint64(kmersize-1) * 2), - prevt: uint64(3) << (uint64(kmersize-1) * 2), + kmermask: kmermask, // k-mer mask used to set to 1 the bits that are not in the k-mer + prevc: (uint64(1) << (uint64(kmersize-1) * 2)) | kmermask, + prevg: (uint64(2) << (uint64(kmersize-1) * 2)) | kmermask, + prevt: (uint64(3) << (uint64(kmersize-1) * 2)) | kmermask, graph: make(map[uint64]uint), } @@ -161,19 +170,34 @@ func (g *DeBruijnGraph) FilterMinWeight(min int) { } } +// FilterMaxWeight filters the DeBruijnGraph by removing nodes whose weight is greater than the specified maximum. +// +// max: an integer representing the maximum count threshold. +func (g *DeBruijnGraph) FilterMaxWeight(max int) { + umax := uint(max) + for idx, count := range g.graph { + if count > umax { + delete(g.graph, idx) + } + } +} + func (g *DeBruijnGraph) Previouses(index uint64) []uint64 { if _, ok := g.graph[index]; !ok { log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index) } rep := make([]uint64, 0, 4) + + index &= ^g.kmermask index >>= 2 - if _, ok := g.graph[index]; ok { - rep = append(rep, index) + key := index | g.kmermask + if _, ok := g.graph[key]; ok { + rep = append(rep, key) } - key := index | g.prevc + key = index | g.prevc if _, ok := g.graph[key]; ok { rep = append(rep, key) } @@ -197,7 +221,7 @@ func (g *DeBruijnGraph) Nexts(index uint64) []uint64 { } rep := make([]uint64, 0, 4) - index = (index << 2) & g.kmermask + index = (index << 2) | g.kmermask if _, ok := g.graph[index]; ok { rep = append(rep, index) } @@ -268,6 +292,33 @@ func (g *DeBruijnGraph) MaxHead() (uint64, int, bool) { return rep, int(max), found } +func (g *DeBruijnGraph) Terminals() []uint64 { + rep := make([]uint64, 0, 10) + + for k := range g.graph { + if len(g.Nexts(k)) == 0 { + rep = append(rep, k) + } + } + + return rep +} + +func (g *DeBruijnGraph) MaxTerminal() (uint64, int, bool) { + rep := uint64(0) + max := uint(0) + found := false + for k, w := range g.graph { + if len(g.Nexts(k)) == 0 && w > max { + rep = k + max = w + found = true + } + } + + return rep, int(max), found +} + func (g *DeBruijnGraph) MaxPath() []uint64 { path := make([]uint64, 0, 1000) ok := false @@ -318,7 +369,11 @@ func (g *DeBruijnGraph) LongestConsensus(id string, min_cov float64) (*obiseq.Bi return nil, fmt.Errorf("graph is empty") } //path := g.LongestPath(max_length) - path := g.HaviestPath() + path, err := g.HaviestPath(nil, nil, false) + + if err != nil { + return nil, err + } spath := path @@ -481,7 +536,7 @@
func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) } current <<= 2 - current &= graph.kmermask + current |= graph.kmermask b := iupac[sequence[0]] current |= b[0] graph.graph[current] = uint(graph.Weight(current) + weight) @@ -495,6 +550,36 @@ func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int) } } +// func (graph *DeBruijnGraph) search(current uint64, extension []byte, path []uint64, error,errormax int) ([]uint64,error) { + +// path = append(path, current) + +// if len(extension) == 0 { +// return path,nil +// } + +// current <<= 2 +// current &= graph.kmermask +// b := iupac[extension[0]] + +// newPath := path +// if len(b) > 1 { +// newPath = slices.Clone(path) +// } + +// current |= b[0] + +// _, ok := graph.graph[current] +// if ok { +// newPath = append(newPath, current) +// } +// rep, err := graph.search(current, extension[1:], newPath, error,errormax) +// if err != nil { +// return path,err +// } + +// } + // Push appends a BioSequence to the DeBruijnGraph. // // Parameters: @@ -523,6 +608,7 @@ func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) { initFirstKmer(start+1, key) } } else { + key |= graph.kmermask graph.graph[key] = uint(graph.Weight(key) + w) graph.append(s[graph.kmersize:], key, w) } @@ -533,6 +619,110 @@ func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) { } } +func (graph *DeBruijnGraph) search(sequence []byte, mismatch, errormax int) []uint64 { + var initFirstKmer func(start int, key uint64) []uint64 + + initFirstKmer = func(start int, key uint64) []uint64 { + if start == graph.kmersize { + key |= graph.kmermask + if _, ok := graph.graph[key]; ok { + return []uint64{key} + } else { + return []uint64{} + } + } + + keys := make([]uint64, 0, 1000) + + if start == 0 { + key = 0 + } + + key <<= 2 + b := iupac[sequence[start]] + + for _, code := range b { + key &= ^uint64(3) + key |= code + keys = append(keys, initFirstKmer(start+1, key)...) + } + + // w := []string{} + // for _, k := range keys { + // w = append(w, graph.DecodeNode(k)) + // } + // // log.Warnf("For %s found %d matches : %v", sequence, len(keys), w) + + return keys + } + + rep := initFirstKmer(0, 0) + + return rep +} + +func (graph *DeBruijnGraph) Search(sequence *obiseq.BioSequence, errormax int) []uint64 { + + s := sequence.Sequence() // Get the sequence as a byte slice + + if len(s) < graph.KmerSize() { + s = slices.Clone(s) + for len(s) < graph.KmerSize() { + s = append(s, 'n') + } + } + + log.Warnf("searching for %s", s) + keys := graph.search(s, 0, errormax) + + for mismatch := 1; mismatch <= errormax; mismatch++ { + log.Warnf("searching with %d error for %s", mismatch, s) + for probe := range IterateOneError(s[0:graph.kmersize]) { + keys = append(keys, + graph.search(probe, mismatch, errormax)..., + ) + } + } + keys = obiutils.Unique(keys) + + return keys +} + +func (graph *DeBruijnGraph) BackSearch(sequence *obiseq.BioSequence, errormax int) []uint64 { + lkmer := graph.KmerSize() + + s := sequence.Sequence() // Get the sequence as a byte slice + + if len(s) < lkmer { + sn := []byte{} + ls := len(s) + for ls < lkmer { + sn = append(sn, 'n') + ls++ + } + s = append(sn, s...) 
+ } else { + s = s[(len(s) - lkmer):] + } + + log.Warnf("back-searching for %s", s) + + keys := graph.search(s, 0, errormax) + + for mismatch := 1; mismatch <= errormax; mismatch++ { + log.Warnf("searching with %d error for %s", mismatch, s) + for probe := range IterateOneError(s[0:graph.kmersize]) { + // log.Warnf("searching with %d error for %s", mismatch, probe) + keys = append(keys, + graph.search(probe, mismatch, errormax)..., + ) + } + } + + keys = obiutils.Unique(keys) + return keys +} + func (graph *DeBruijnGraph) Gml() string { buffer := bytes.NewBuffer(make([]byte, 0, 1000)) @@ -614,7 +804,7 @@ func (graph *DeBruijnGraph) WriteGml(filename string) error { func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int { ident := ^((kmer1 & kmer2) | (^kmer1 & ^kmer2)) ident |= (ident >> 1) - ident &= 0x5555555555555555 & g.kmermask + ident &= 0x5555555555555555 & ^g.kmermask return bits.OnesCount64(ident) } @@ -638,11 +828,23 @@ func (h *UInt64Heap) Pop() any { return x } -func (g *DeBruijnGraph) HaviestPath() []uint64 { +func (g *DeBruijnGraph) HaviestPath(starts, stops []uint64, backPath bool) ([]uint64, error) { - if g.HasCycle() { - return nil + // if g.HasCycle() { + // return nil, fmt.Errorf("graph has a cycle") + // } + + following := g.Nexts + + if backPath { + following = g.Previouses } + + stopNodes := make(map[uint64]bool, len(stops)) + for _, n := range stops { + stopNodes[n] = true + } + // Initialize the distance array and visited set distances := make(map[uint64]int) visited := make(map[uint64]bool) @@ -654,7 +856,11 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 { heap.Init(queue) startNodes := make(map[uint64]struct{}) - for _, n := range g.Heads() { + if starts == nil { + starts = g.Heads() + } + + for _, n := range starts { startNodes[n] = struct{}{} heap.Push(queue, n) distances[n] = g.Weight(n) @@ -686,7 +892,11 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 { log.Warn("current node is 0") } // Update the distance of the neighbors - nextNodes := g.Nexts(currentNode) + + nextNodes := following(currentNode) + if _, ok := stopNodes[currentNode]; ok { + nextNodes = []uint64{} + } for _, nextNode := range nextNodes { if nextNode == 0 { log.Warn("next node is 0") @@ -718,16 +928,178 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 { } if slices.Contains(heaviestPath, currentNode) { - log.Panicf("Cycle detected %v -> %v (%v) len(%v), graph: %v", heaviestPath, currentNode, startNodes, len(heaviestPath), g.Len()) - return nil + return nil, fmt.Errorf("cycle detected in heaviest path") } heaviestPath = append(heaviestPath, currentNode) // Reverse the path - slices.Reverse(heaviestPath) + if !backPath { + slices.Reverse(heaviestPath) + } - return heaviestPath + return heaviestPath, nil +} + +func (g *DeBruijnGraph) HaviestPathDSU(starts, stops []uint64, backPath bool) ([]uint64, error) { + // Collect and sort edges + type Edge struct { + weight float64 + u, v uint64 + } + edges := make([]Edge, 0) + + // Function to get next nodes (either Nexts or Previouses based on backPath) + following := g.Nexts + previouses := g.Previouses + if backPath { + following = g.Previouses + previouses = g.Nexts + } + + // Collect all edges + for u := range g.graph { + for _, v := range following(u) { + edges = append(edges, Edge{ + weight: float64(min(g.Weight(u), g.Weight(v))), + u: u, + v: v, + }) + } + } + + // Sort edges by weight in descending order + sort.Slice(edges, func(i, j int) bool { + return edges[i].weight > edges[j].weight + }) + + // Initialize disjoint set data 
structure + parent := make(map[uint64]uint64) + for u := range g.graph { + parent[u] = u + } + + // Find with path compression + var find func(uint64) uint64 + find = func(node uint64) uint64 { + if parent[node] != node { + parent[node] = find(parent[node]) + } + return parent[node] + } + + // Union function that returns true if cycle is detected + union := func(u, v uint64) bool { + rootU := find(u) + rootV := find(v) + if rootU == rootV { + return true // Cycle detected + } + parent[rootV] = rootU + return false + } + + // If no specific starts provided, use graph heads + if starts == nil { + if !backPath { + starts = g.Heads() + } else { + starts = g.Terminals() + } + } + + // If no specific stops provided, use graph terminals + if stops == nil { + if !backPath { + stops = g.Terminals() + } else { + stops = g.Heads() + } + } + + // Convert stops to a map for O(1) lookup + stopNodes := make(map[uint64]bool) + for _, stop := range stops { + stopNodes[stop] = false + } + + var path []uint64 + maxCapacity := math.Inf(-1) + stopEdge := []Edge{} + + // Process edges in descending order of weight + for _, edge := range edges { + if stopNodes[edge.u] { + continue // Skip edges from stop nodes + } + + if in, ok := stopNodes[edge.v]; ok { + if !in { + stopEdge = append(stopEdge, edge) + stopNodes[edge.v] = true + } + } + + if union(edge.u, edge.v) { + continue // Skip if creates cycle + } + + pathFound := false + for _, sedge := range stopEdge { + // Check if any start-stop pair is connected + fv := find(sedge.v) + for _, s := range starts { + fs := find(s) + // log.Warnf("Start: %d, Stop: %d", fs, fv) + if fs == fv { + pathFound = true + maxCapacity = edge.weight + + // Reconstruct path + current := sedge.v + path = []uint64{current} + for current != s { + oldcurrent := current + // log.Warnf("Start: %d, Current: %d, Previous: %v", s, current, previouses(current)) + for _, prev := range previouses(current) { + if find(prev) == fs { + path = append(path, prev) + current = prev + break + } + } + if current == oldcurrent { + log.Fatalf("Path reconstruction is stuck") + } + + } + // log.Warnf("Built path: %v", path) + break + } + } + if pathFound { + break + } + } + if pathFound { + break + } + } + + // log.Warnf("Stop edge: %v", stopEdge) + + if path == nil { + return nil, fmt.Errorf("no valid path found") + } + + if !backPath { + slices.Reverse(path) + } + log.Warnf("Max capacity: %5.0f: %v", maxCapacity, g.DecodePath(path)) + + return path, nil } func (g *DeBruijnGraph) HasCycle() bool { @@ -765,3 +1137,59 @@ func (g *DeBruijnGraph) HasCycle() bool { } return false } + +// HasCycleInDegree detects the presence of a cycle in the graph using the in-degree method. +// This method is based on topological sorting: if not all nodes can be sorted, +// then the graph contains a cycle.
+// +// Returns: +// - bool: true if the graph contains a cycle, false otherwise +func (g *DeBruijnGraph) HasCycleInDegree() bool { + // Create a map to store the in-degree of each node + inDegree := make(map[uint64]int) + + // Initialize the in-degree of every node to 0 + for node := range g.graph { + inDegree[node] = 0 + } + + // Compute the in-degrees + for node := range g.graph { + for _, next := range g.Nexts(node) { + inDegree[next]++ + } + } + + // Create a deque to store the nodes with an in-degree of 0 + queue := deque.Deque[uint64]{} + + // Add every node with an in-degree of 0 to the deque + for node := range g.graph { + if inDegree[node] == 0 { + queue.PushBack(node) + } + } + + visited := 0 // Counter of visited nodes + + // BFS traversal + for queue.Len() > 0 { + // Remove the first node from the deque + node, _ := queue.PopFront() + visited++ + + // For each adjacent node + for _, next := range g.Nexts(node) { + // Decrease its in-degree + inDegree[next]-- + + // If its in-degree drops to 0, add it to the deque + if inDegree[next] == 0 { + queue.PushBack(next) + } + } + } + + // If there is a cycle, not all nodes can have been visited + return visited != len(g.graph) +} diff --git a/pkg/obikmer/oneerror.go b/pkg/obikmer/oneerror.go new file mode 100644 index 0000000..c518ad7 --- /dev/null +++ b/pkg/obikmer/oneerror.go @@ -0,0 +1,45 @@ +package obikmer + +import ( + "iter" + "slices" +) + +var baseError = map[byte]byte{ + 'a': 'b', + 'c': 'd', + 'g': 'h', + 't': 'v', + 'r': 'y', + 'y': 'r', + 's': 'w', + 'w': 's', + 'k': 'm', + 'm': 'k', + 'd': 'c', + 'v': 't', + 'h': 'g', + 'b': 'a', +} + +type BytesItem []byte + +func IterateOneError(kmer []byte) iter.Seq[BytesItem] { + lkmer := len(kmer) + return func(yield func(BytesItem) bool) { + for p := 0; p < lkmer; p++ { + for p < lkmer && kmer[p] == 'n' { + p++ + } + + if p < lkmer { + nkmer := slices.Clone(kmer) + nkmer[p] = baseError[kmer[p]] + if !yield(nkmer) { + return + } + } + } + } + +} diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index d6c5703..ff1e79c 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "6a8061c" +var _Commit = "4774438" var _Version = "Release 4.2.0" // Version returns the version of the obitools package.
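[Reviewer note, not part of the patch] A minimal, hypothetical usage sketch of the IterateOneError iterator added above (assumes Go >= 1.23 for range-over-func; the main wrapper is illustrative only):

package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
)

func main() {
	// For each position of the lower-case probe, IterateOneError yields a copy
	// where that base is replaced by the IUPAC code covering the three other
	// bases ('a'->'b', 'c'->'d', 'g'->'h', 't'->'v'); 'n' positions are skipped.
	// Search/BackSearch use these probes to anchor primers with one mismatch.
	for probe := range obikmer.IterateOneError([]byte("acgtgca")) {
		fmt.Println(string(probe))
	}
}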
diff --git a/pkg/obiseq/paired_reads.go b/pkg/obiseq/paired_reads.go index 2586b1d..d8fb6c7 100644 --- a/pkg/obiseq/paired_reads.go +++ b/pkg/obiseq/paired_reads.go @@ -25,7 +25,7 @@ func (s *BioSequence) UnPair() { } func (s *BioSequenceSlice) IsPaired() bool { - return (*s)[0].paired != nil + return s != nil && s.Len() > 0 && (*s)[0].paired != nil } func (s *BioSequenceSlice) PairedWith() *BioSequenceSlice { diff --git a/pkg/obitools/obimicroasm/microasm.go b/pkg/obitools/obimicroasm/microasm.go new file mode 100644 index 0000000..034a8db --- /dev/null +++ b/pkg/obitools/obimicroasm/microasm.go @@ -0,0 +1,520 @@ +package obimicroasm + +import ( + "fmt" + "os" + "path" + "slices" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix" +) + +func BuildFilterOnPatternReadPairWorker( + forward, reverse string, + errormax int, + cutReads bool, +) obiseq.SeqWorker { + forwardPatternDir, err := obiapat.MakeApatPattern(forward, errormax, false) + + if err != nil { + log.Fatalf("Cannot compile forward primer %s : %v", forward, err) + } + + reverse_rev := obiseq.NewBioSequence("fp", []byte(reverse), "").ReverseComplement(true).String() + reveresePatternRev, err := obiapat.MakeApatPattern(reverse_rev, errormax, false) + + if err != nil { + log.Fatalf("Cannot compile reverse complement reverse primer %s : %v", reverse, err) + } + + matchRead := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { + var aseq obiapat.ApatSequence + var err error + var read, match *obiseq.BioSequence + + aseq, err = obiapat.MakeApatSequence(sequence, false) + + if err != nil { + log.Fatalf("Cannot prepare apat sequence from %s : %v", sequence.Id(), err) + } + + start, end, nerr, matched := forwardPatternDir.BestMatch(aseq, 0, aseq.Len()) + + if matched { + read = sequence + + if cutReads { + read, err = sequence.Subsequence(start, sequence.Len(), false) + + if err != nil { + log.Fatalf("Cannot cut, on forward, forward read %s [%d,%d] : %v", + sequence.Id(), start, sequence.Len(), err) + } + } + + read.SetAttribute("forward_primer", forward) + match, _ = sequence.Subsequence(start, end, false) + read.SetAttribute("forward_match", match.String()) + read.SetAttribute("forward_error", nerr) + + aseq, err = obiapat.MakeApatSequence(read, false, aseq) + + if err != nil { + log.Fatalf("Cannot prepare apat sequence from %s : %v", sequence.Id(), err) + } + + start, end, nerr, matched = reveresePatternRev.BestMatch(aseq, 0, aseq.Len()) + + if matched { + + frread := read + + if cutReads { + frread, err = read.Subsequence(0, end, false) + + if err != nil { + log.Fatalf("Cannot xxx cut, on reverse, forward read %s [%d,%d] : %v", + sequence.Id(), start, read.Len(), err) + } + } + + frread.SetAttribute("reverse_primer", reverse) + match, _ = read.Subsequence(start, end, false) + frread.SetAttribute("reverse_match", match.ReverseComplement(true).String()) + frread.SetAttribute("reverse_error", nerr) + + read = frread + // log.Warnf("Forward-Reverse primer matched on %s : %d\n%s", read.Id(), read.Len(), + // obiformats.FormatFasta(read, obiformats.FormatFastSeqJsonHeader)) + } + + } else { + start, end, nerr, matched = 
reveresePatternRev.BestMatch(aseq, 0, aseq.Len()) + + if matched { + read = sequence + if cutReads { + read, err = sequence.Subsequence(0, end, false) + + if err != nil { + log.Fatalf("Cannot yyy cut, on reverse, forward read %s [%d,%d] : %v", + sequence.Id(), 0, end, err) + } + + } + + read.SetAttribute("reverse_primer", reverse) + match, _ = read.Subsequence(start, end, false) + read.SetAttribute("reverse_match", match.ReverseComplement(true).String()) + read.SetAttribute("reverse_error", nerr) + } else { + read = nil + } + + } + + return read + } + + w := func(sequence *obiseq.BioSequence) (result obiseq.BioSequenceSlice, err error) { + result = obiseq.MakeBioSequenceSlice() + + paired := sequence.PairedWith() + sequence.UnPair() + + read := matchRead(sequence) + + if read == nil { + sequence = sequence.ReverseComplement(true) + read = matchRead(sequence) + } + + if read != nil { + result = append(result, read) + } + + if paired != nil { + read = matchRead(paired) + + if read == nil { + read = matchRead(paired.ReverseComplement(true)) + } + + if read != nil { + result = append(result, read) + } + } + + return + } + + return w +} + +func ExtractOnPatterns(iter obiiter.IBioSequence, + forward, reverse string, + errormax int, + cutReads bool, +) obiseq.BioSequenceSlice { + + matched := iter.MakeIWorker( + BuildFilterOnPatternReadPairWorker(forward, reverse, errormax, cutReads), + false, + ) + + rep := obiseq.MakeBioSequenceSlice() + + for matched.Next() { + frgs := matched.Get() + rep = append(rep, frgs.Slice()...) + } + + return rep +} + +func BuildPCRProduct(seqs obiseq.BioSequenceSlice, + consensus_id string, + kmer_size int, + forward, reverse string, + backtrack bool, + save_graph bool, dirname string) (*obiseq.BioSequence, error) { + + from := obiseq.NewBioSequence("forward", []byte(forward), "") + to := obiseq.NewBioSequence("reverse", []byte(CLIReversePrimer()), "").ReverseComplement(true) + + if backtrack { + from, to = to, from + } + + if seqs.Len() == 0 { + return nil, fmt.Errorf("no sequence provided") + } + + if save_graph { + if dirname == "" { + dirname = "." 
+ } + + if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { + // path does not exist or is not directory + os.RemoveAll(dirname) + err := os.Mkdir(dirname, 0755) + + if err != nil { + log.Panicf("Cannot create directory %s for saving graphs", dirname) + } + } + + fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s_consensus.fasta", consensus_id))) + + if err == nil { + defer fasta.Close() + fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch( + fmt.Sprintf("%s_consensus", consensus_id), + 0, + seqs, + ), + obiformats.FormatFastSeqJsonHeader, false).Bytes()) + fasta.Close() + } + + } + + log.Debugf("Number of reads : %d\n", len(seqs)) + + if kmer_size < 0 { + longest := make([]int, len(seqs)) + + for i, seq := range seqs { + s := obiseq.BioSequenceSlice{seq} + sa := obisuffix.BuildSuffixArray(&s) + longest[i] = slices.Max(sa.CommonSuffix()) + } + + // spectrum := map[int]int{} + // for _, s := range longest { + // spectrum[s]++ + // } + + // log.Warnf("spectrum kmer size : %v", spectrum) + + kmer_size = slices.Max(longest) + 1 + log.Infof("estimated kmer size : %d", kmer_size) + } + + var graph *obikmer.DeBruijnGraph + + var hp []uint64 + var err error + var starts []uint64 + var stops []uint64 + + for { + graph = obikmer.MakeDeBruijnGraph(kmer_size) + + for _, s := range seqs { + graph.Push(s) + } + + if !backtrack { + starts = graph.Search(from, CLIAllowedMismatch()) + stops = graph.BackSearch(to, CLIAllowedMismatch()) + } else { + starts = graph.BackSearch(from, CLIAllowedMismatch()) + stops = graph.Search(to, CLIAllowedMismatch()) + } + + log.Infof("Found %d starts", len(starts)) + pweight := map[int]int{} + for _, s := range starts { + w := graph.Weight(s) + pweight[w]++ + log.Warnf("Start : %s (%d)\n", graph.DecodeNode(s), w) + } + + log.Infof("Found %d stops", len(stops)) + for _, s := range stops { + w := graph.Weight(s) + pweight[w]++ + log.Warnf("Stop : %s (%d)\n", graph.DecodeNode(s), w) + } + + log.Infof("Weight spectrum : %v", pweight) + + wmax := 0 + sw := 0 + for w := range pweight { + sw += w + if w > wmax { + wmax = w + } + } + + graph.FilterMinWeight(int(sw / len(pweight))) + graph.FilterMaxWeight(int(wmax * 2)) + + log.Infof("Minimum coverage : %d", int(sw/len(pweight))) + log.Infof("Maximum coverage : %d", int(wmax*2)) + + if !graph.HasCycleInDegree() { + break + } + + kmer_size++ + + if kmer_size > 31 { + break + } + + SetKmerSize(kmer_size) + log.Warnf("Cycle detected, increasing kmer size to %d\n", kmer_size) + } + + if !backtrack { + starts = graph.Search(from, CLIAllowedMismatch()) + stops = graph.BackSearch(to, CLIAllowedMismatch()) + } else { + starts = graph.BackSearch(from, CLIAllowedMismatch()) + stops = graph.Search(to, CLIAllowedMismatch()) + } + + hp, err = graph.HaviestPath(starts, stops, backtrack) + + log.Debugf("Graph size : %d\n", graph.Len()) + + maxw := graph.MaxWeight() + modew := graph.WeightMode() + meanw := graph.WeightMean() + specw := graph.WeightSpectrum() + kmer := graph.KmerSize() + + log.Warnf("Weight mode: %d Weight mean: %4.1f Weight max: %d, kmer = %d", modew, meanw, maxw, kmer) + log.Warn(specw) + + if save_graph { + + file, err := os.Create(path.Join(dirname, + fmt.Sprintf("%s_consensus.gml", consensus_id))) + + if err != nil { + fmt.Println(err) + } else { + file.WriteString(graph.Gml()) + file.Close() + } + } + + if err == nil { + s := graph.DecodePath(hp) + + seq := obiseq.NewBioSequence(consensus_id, []byte(s), "") + + total_kmer := graph.Len() + sumCount := 0 + + if seq != nil { + for _, s :=
range seqs { + sumCount += s.Count() + } + seq.SetAttribute("obiconsensus_consensus", true) + seq.SetAttribute("obiconsensus_weight", sumCount) + seq.SetAttribute("obiconsensus_seq_length", seq.Len()) + seq.SetAttribute("obiconsensus_kmer_size", kmer_size) + seq.SetAttribute("obiconsensus_kmer_max_occur", graph.MaxWeight()) + seq.SetAttribute("obiconsensus_filtered_graph_size", graph.Len()) + seq.SetAttribute("obiconsensus_full_graph_size", total_kmer) + } + + log.Warnf("Consensus sequence : \n%s", obiformats.FormatFasta(seq, obiformats.FormatFastSeqJsonHeader)) + + return seq, nil + + } + + return nil, err +} + +func CLIAssemblePCR() *obiseq.BioSequence { + + pairs, err := CLIPairedSequence() + + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + + matched := ExtractOnPatterns(pairs, + CLIForwardPrimer(), + CLIReversePrimer(), + CLIAllowedMismatch(), + true, + ) + + seq, err := BuildPCRProduct( + matched, + CLIGraphFilesDirectory(), + CLIKmerSize(), + CLIForwardPrimer(), + CLIReversePrimer(), + false, + CLISaveGraphToFiles(), + CLIGraphFilesDirectory()) + + if err != nil { + log.Fatalf("Cannot build the consensus sequence : %v", err) + + } + + forwardPatternDir, err := obiapat.MakeApatPattern( + CLIForwardPrimer(), + CLIAllowedMismatch(), + false) + + if err != nil { + log.Fatalf("Cannot compile forward primer %s : %v", CLIForwardPrimer(), err) + } + + reverse_rev := obiseq.NewBioSequence("fp", []byte(CLIReversePrimer()), "").ReverseComplement(true).String() + reveresePatternRev, err := obiapat.MakeApatPattern(reverse_rev, CLIAllowedMismatch(), false) + + if err != nil { + log.Fatalf("Cannot compile reverse complement reverse primer %s : %v", CLIReversePrimer(), err) + } + + aseq, err := obiapat.MakeApatSequence(seq, false) + + if err != nil { + log.Fatalf("Cannot build apat sequence: %v", err) + } + + fstart, fend, fnerr, hasfw := forwardPatternDir.BestMatch(aseq, 0, aseq.Len()) + rstart, rend, rnerr, hasrev := reveresePatternRev.BestMatch(aseq, 0, aseq.Len()) + + for hasfw && !hasrev { + var rseq *obiseq.BioSequence + rseq, err = BuildPCRProduct( + matched, + CLIGraphFilesDirectory(), + CLIKmerSize(), + CLIForwardPrimer(), + CLIReversePrimer(), + true, + CLISaveGraphToFiles(), + CLIGraphFilesDirectory()) + + if err != nil { + log.Fatalf("Cannot build Reverse PCR sequence: %v", err) + } + + kmerSize, _ := seq.GetIntAttribute("obiconsensus_kmer_size") + fp, _ := seq.Subsequence(seq.Len()-kmerSize, seq.Len(), false) + rp, _ := rseq.Subsequence(0, kmerSize, false) + rp = rp.ReverseComplement(true) + + pairs, err := CLIPairedSequence() + + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + + nmatched := ExtractOnPatterns(pairs, + fp.String(), + rp.String(), + CLIAllowedMismatch(), + true, + ) + + in := map[string]bool{} + + for _, s := range matched { + in[s.String()] = true + } + + for _, s := range nmatched { + if !in[s.String()] { + matched = append(matched, s) + } + } + + seq, err = BuildPCRProduct( + matched, + CLIGraphFilesDirectory(), + CLIKmerSize(), + CLIForwardPrimer(), + CLIReversePrimer(), + false, + CLISaveGraphToFiles(), + CLIGraphFilesDirectory()) + + aseq, err := obiapat.MakeApatSequence(seq, false) + + if err != nil { + log.Fatalf("Cannot build apat sequence: %v", err) + } + fstart, fend, fnerr, hasfw = forwardPatternDir.BestMatch(aseq, 0, aseq.Len()) + rstart, rend, rnerr, hasrev = reveresePatternRev.BestMatch(aseq, 0, aseq.Len()) + + } + + marker, _ := seq.Subsequence(fstart, rend, false) + + 
marker.SetAttribute("forward_primer", CLIForwardPrimer()) + match, _ := seq.Subsequence(fstart, fend, false) + marker.SetAttribute("forward_match", match.String()) + marker.SetAttribute("forward_error", fnerr) + + marker.SetAttribute("reverse_primer", CLIReversePrimer()) + match, _ = seq.Subsequence(rstart, rend, false) + marker.SetAttribute("reverse_match", match.ReverseComplement(true).String()) + marker.SetAttribute("reverse_error", rnerr) + + return marker +} diff --git a/pkg/obitools/obimicroasm/options.go b/pkg/obitools/obimicroasm/options.go new file mode 100644 index 0000000..4ca0e01 --- /dev/null +++ b/pkg/obitools/obimicroasm/options.go @@ -0,0 +1,139 @@ +package obimicroasm + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" + log "github.com/sirupsen/logrus" ) + +var _ForwardFile = "" +var _ReverseFile = "" +var _ForwardPrimer string +var _ReversePrimer string +var _AllowedMismatch = 0 +var _kmerSize = -1 + +var _saveGraph = "__@@NOSAVE@@__" + +func MicroAsmOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_ForwardFile, "forward-reads", "", + options.Alias("F"), + options.ArgName("FILENAME_F"), + options.Required("You must provide a forward file"), + options.Description("The file name containing the forward reads")) + options.StringVar(&_ReverseFile, "reverse-reads", "", + options.Alias("R"), + options.ArgName("FILENAME_R"), + options.Required("You must provide a reverse file"), + options.Description("The file name containing the reverse reads")) + options.StringVar(&_ForwardPrimer, "forward", "", + options.Required("You must provide a forward primer"), + options.Description("The forward primer used for the electronic PCR.")) + + options.StringVar(&_ReversePrimer, "reverse", "", + options.Required("You must provide a reverse primer"), + options.Description("The reverse primer used for the electronic PCR.")) + + options.IntVar(&_AllowedMismatch, "allowed-mismatches", 0, + options.Alias("e"), + options.Description("Maximum number of mismatches allowed for each primer.")) + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.ArgName("SIZE"), + options.Description("The size of the kmer used to build the consensus. "+ + "Default value = -1, which means that the kmer size is estimated from the data"), + ) + + options.StringVar(&_saveGraph, "save-graph", _saveGraph, + options.Description("Creates a directory containing the set of DAGs used by the obimicroasm assembly algorithm. "
"+ "The graph files follow the GML format."), + ) + +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + MicroAsmOptionSet(options) +} + +// CLIForwardPrimer returns the sequence of the forward primer as indicated by the +// --forward command line option +func CLIForwardPrimer() string { + pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch, false) + + if err != nil { + log.Fatalf("%+v", err) + } + + pattern.Free() + + return _ForwardPrimer +} + +// CLIReversePrimer returns the sequence of the reverse primer as indicated by the +// --reverse command line option +func CLIReversePrimer() string { + pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch, false) + + if err != nil { + log.Fatalf("%+v", err) + } + + pattern.Free() + + return _ReversePrimer +} + +// CLIAllowedMismatch returns the allowed mismatch count between each +// primer and the sequences as indicated by the +// --allowed-mismatches|-e command line option +func CLIAllowedMismatch() int { + return _AllowedMismatch +} + +func CLIPairedSequence() (obiiter.IBioSequence, error) { + forward, err := obiconvert.CLIReadBioSequences(_ForwardFile) + if err != nil { + return obiiter.NilIBioSequence, err + } + + reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile) + if err != nil { + return obiiter.NilIBioSequence, err + } + + paired := forward.PairTo(reverse) + + return paired, nil +} + +func CLIForwardFile() string { + return _ForwardFile +} + +// Returns true if the assembly graphs must be saved +func CLISaveGraphToFiles() bool { + return _saveGraph != "__@@NOSAVE@@__" +} + +// It returns the directory where the graph files are saved +func CLIGraphFilesDirectory() string { + return _saveGraph +} + +// CLIKmerSize returns the value of the kmer size to use for building the consensus. +// +// The value of the kmer size is set by the user with the `--kmer-size` option. +// The value -1 means that the kmer size is estimated as the minimum value that +// ensures that no k-mer is present more than once in a sequence. +// +// No parameters. +// Returns an integer value. +func CLIKmerSize() int { + return _kmerSize +} + +func SetKmerSize(kmerSize int) { + _kmerSize = kmerSize +} diff --git a/pkg/obiutils/unique.go b/pkg/obiutils/unique.go new file mode 100644 index 0000000..f14260e --- /dev/null +++ b/pkg/obiutils/unique.go @@ -0,0 +1,20 @@ +package obiutils + +// Unique returns a new slice containing only unique values from the input slice. +// The order of elements in the output slice is not guaranteed to match the input order. +// +// Parameters: +// - slice: The input slice containing potentially duplicate values +// +// Returns: +// - A new slice containing only unique values +func Unique[T comparable](slice []T) []T { + // Create a set to track unique values + seen := Set[T]{} + + for _, v := range slice { + seen.Add(v) + } + + return seen.Members() +}